我想用awk脚本生成一个HTML文件。我的字符串中可能包含“<”和“&”这样的字符。有没有一个简短且经过验证的awk函数可以完成转义?
atmip9wb1#
如果只需要最基本的转义,可以这样做:
# escapeHtml(t) - return a copy of t that is safe to embed in HTML text or in
# a double-quoted attribute value.
#
#   t        the raw string to escape
#   returns  t with & " < > replaced by &amp; &quot; &lt; &gt;
#
# In a gsub() replacement a bare "&" means "the matched text", so every
# literal ampersand in the replacement is written as "\\&".
function escapeHtml(t)
{
    # Must do this one first: the replacements below introduce ampersands
    # that must not be escaped a second time.
    gsub(/&/, "\\&amp;", t);
    gsub(/"/, "\\&quot;", t);
    gsub(/</, "\\&lt;", t);
    gsub(/>/, "\\&gt;", t);
    return t;
}
# escapeHtml(t) - return a copy of t that is safe to embed in HTML text or in
# a double-quoted attribute value.
#
#   t        the raw string to escape
#   returns  t with & " < > replaced by &amp; &quot; &lt; &gt;
#
# In a gsub() replacement a bare "&" means "the matched text", so every
# literal ampersand in the replacement is written as "\\&".
function escapeHtml(t)
{
    # Must do this one first: the replacements below introduce ampersands
    # that must not be escaped a second time.
    gsub(/&/, "\\&amp;", t);
    gsub(/"/, "\\&quot;", t);
    gsub(/</, "\\&lt;", t);
    gsub(/>/, "\\&gt;", t);
    return t;
}
字符串
vwkv1x7d2#
当然可以。只需对每一行要转换的内容($0)调用 makeEntities() 即可,也可以把它改成接受参数的形式。这个函数是我为处理英国国家语料库(British National Corpus)而写的,该语料库使用的实体与HTML实体高度重叠,但*并非100%*一致,所以如果您需要用到一些不常见的字符,请自行核实它们是否正确。
makeEntities()
$0
# makeEntities() - rewrite the current record ($0) in place, replacing
# special and accented characters with named character entities.
#
# Takes no arguments and returns nothing; the result is left in $0.
#
# The entity names follow the set this table was written for, which is not
# identical to HTML's: names such as &agr; &bgr; &dgr; &egr; &ggr; &lgr;
# &mgr; &sgr; &tgr; &phgr; &ft; &ins; &rehy; and &shilling; are not standard
# HTML entities, so verify any character you depend on.
#
# In a gsub() replacement a bare "&" means "the matched text", so the literal
# ampersand that starts each entity is written as "\\&".
function makeEntities() {
    # Must run first: every replacement below introduces an ampersand that
    # must not be escaped a second time.
    gsub(/&/, "\\&amp;");

    gsub(/á/, "\\&aacute;");
    gsub(/Á/, "\\&Aacute;");
    gsub(/ă/, "\\&abreve;");
    gsub(/â/, "\\&acirc;");
    gsub(/´/, "\\&acute;");
    gsub(/æ/, "\\&aelig;");
    gsub(/Æ/, "\\&AElig;");
    gsub(/α/, "\\&agr;");
    gsub(/à/, "\\&agrave;");
    gsub(/ā/, "\\&amacr;");
    gsub(/Ā/, "\\&Amacr;");
    gsub(/ą/, "\\&aogon;");
    gsub(/å/, "\\&aring;");
    gsub(/Å/, "\\&Aring;");
    gsub(/ã/, "\\&atilde;");
    gsub(/ä/, "\\&auml;");
    gsub(/Ä/, "\\&Auml;");
    gsub(/β/, "\\&bgr;");
    gsub(/\\/, "\\&bsol;");
    gsub(/•/, "\\&bull;");
    gsub(/ć/, "\\&cacute;");
    gsub(/č/, "\\&ccaron;");
    gsub(/Č/, "\\&Ccaron;");
    gsub(/ç/, "\\&ccedil;");
    gsub(/Ç/, "\\&Ccedil;");
    gsub(/ĉ/, "\\&ccirc;");
    gsub(/✓/, "\\&check;");
    gsub(/ˆ/, "\\&circ;");
    gsub(/@/, "\\&commat;");
    gsub(/©/, "\\&copy;");
    gsub(/‐/, "\\&dash;");
    gsub(/ď/, "\\&dcaron;");
    gsub(/°/, "\\&deg;");
    gsub(/δ/, "\\&dgr;");
    gsub(/Δ/, "\\&Dgr;");
    gsub(/¨/, "\\&die;");
    gsub(/\$/, "\\&dollar;");
    gsub(/đ/, "\\&dstrok;");
    gsub(/é/, "\\&eacute;");
    gsub(/É/, "\\&Eacute;");
    gsub(/ě/, "\\&ecaron;");
    gsub(/ê/, "\\&ecirc;");
    gsub(/è/, "\\&egrave;");
    gsub(/È/, "\\&Egrave;");
    gsub(/ε/, "\\&egr;");
    gsub(/ē/, "\\&emacr;");
    gsub(/Ē/, "\\&Emacr;");
    gsub(/ę/, "\\&eogon;");
    gsub(/ð/, "\\&eth;");
    gsub(/ë/, "\\&euml;");
    gsub(/Ë/, "\\&Euml;");
    gsub(/♭/, "\\&flat;");
    gsub(/½/, "\\&frac12;");
    gsub(/⅓/, "\\&frac13;");
    gsub(/¼/, "\\&frac14;");
    gsub(/⅕/, "\\&frac15;");
    gsub(/⅙/, "\\&frac16;");
    gsub(/⅛/, "\\&frac18;");
    gsub(/⅔/, "\\&frac23;");
    gsub(/⅖/, "\\&frac25;");
    gsub(/¾/, "\\&frac34;");
    gsub(/⅗/, "\\&frac35;");
    gsub(/⅜/, "\\&frac38;");
    gsub(/⅘/, "\\&frac45;");
    gsub(/⅝/, "\\&frac58;");
    gsub(/⅞/, "\\&frac78;");
    gsub(/′/, "\\&ft;");
    gsub(/γ/, "\\&ggr;");
    gsub(/>/, "\\&gt;");
    gsub(/½/, "\\&half;");      # never matches: already replaced by &frac12; above
    gsub(/ħ/, "\\&hstrok;");
    gsub(/í/, "\\&iacute;");
    gsub(/Í/, "\\&Iacute;");
    gsub(/î/, "\\&icirc;");
    gsub(/Î/, "\\&Icirc;");
    gsub(/ì/, "\\&igrave;");
    gsub(/ī/, "\\&imacr;");
    gsub(/″/, "\\&ins;");
    gsub(/¿/, "\\&iquest;");
    gsub(/ï/, "\\&iuml;");
    gsub(/Ï/, "\\&Iuml;");
    gsub(/ĺ/, "\\&lacute;");
    gsub(/Ĺ/, "\\&Lacute;");
    gsub(/\{/, "\\&lcub;");
    gsub(/≤/, "\\&le;");
    gsub(/λ/, "\\&lgr;");
    gsub(/_/, "\\&lowbar;");
    gsub(/\[/, "\\&lsqb;");
    gsub(/ł/, "\\&lstrok;");
    gsub(/Ł/, "\\&Lstrok;");
    gsub(/</, "\\&lt;");
    gsub(/—/, "\\&mdash;");
    gsub(/μ/, "\\&mgr;");
    gsub(/µ/, "\\&micro;");
    gsub(/·/, "\\&middot;");
    gsub(/ń/, "\\&nacute;");
    gsub(/ň/, "\\&ncaron;");
    gsub(/ņ/, "\\&ncedil;");
    gsub(/–/, "\\&ndash;");
    gsub(/ñ/, "\\&ntilde;");
    gsub(/Ñ/, "\\&Ntilde;");
    gsub(/#/, "\\&num;");
    gsub(/ó/, "\\&oacute;");
    gsub(/Ó/, "\\&Oacute;");
    gsub(/ô/, "\\&ocirc;");
    gsub(/œ/, "\\&oelig;");
    gsub(/ò/, "\\&ograve;");
    gsub(/Ω/, "\\&ohm;");
    gsub(/ō/, "\\&omacr;");
    gsub(/ø/, "\\&oslash;");
    gsub(/Ø/, "\\&Oslash;");
    gsub(/õ/, "\\&otilde;");
    gsub(/ö/, "\\&ouml;");
    gsub(/Ö/, "\\&Ouml;");
    gsub(/φ/, "\\&phgr;");
    gsub(/\+/, "\\&plus;");
    gsub(/±/, "\\&plusmn;");
    gsub(/£/, "\\&pound;");
    gsub(/ŕ/, "\\&racute;");
    gsub(/√/, "\\&radic;");
    gsub(/ř/, "\\&rcaron;");
    gsub(/Ř/, "\\&Rcaron;");
    gsub(/\}/, "\\&rcub;");
    gsub(/®/, "\\&reg;");
    gsub(/-/, "\\&rehy;");
    gsub(/\]/, "\\&rsqb;");
    gsub(/ś/, "\\&sacute;");
    gsub(/Ś/, "\\&Sacute;");
    gsub(/š/, "\\&scaron;");
    gsub(/Š/, "\\&Scaron;");
    gsub(/ş/, "\\&scedil;");
    gsub(/Ş/, "\\&Scedil;");
    gsub(/ŝ/, "\\&scirc;");
    gsub(/σ/, "\\&sgr;");
    gsub(/♯/, "\\&sharp;");
    gsub(/\//, "\\&shilling;");
    gsub(/∼/, "\\&sim;");
    gsub(/\//, "\\&sol;");      # never matches: "/" already replaced by &shilling; above
    gsub(/²/, "\\&sup2;");
    gsub(/ß/, "\\&szlig;");
    gsub(/ť/, "\\&tcaron;");
    gsub(/ţ/, "\\&tcedil;");
    gsub(/τ/, "\\&tgr;");
    gsub(/þ/, "\\&thorn;");
    gsub(/Þ/, "\\&THORN;");
    gsub(/×/, "\\&times;");
    gsub(/™/, "\\&trade;");
    gsub(/ú/, "\\&uacute;");
    gsub(/Ú/, "\\&Uacute;");
    gsub(/û/, "\\&ucirc;");
    gsub(/ù/, "\\&ugrave;");
    gsub(/ū/, "\\&umacr;");
    gsub(/¨/, "\\&uml;");       # never matches: already replaced by &die; above
    gsub(/ů/, "\\&uring;");
    gsub(/ü/, "\\&uuml;");
    gsub(/Ü/, "\\&Uuml;");
    gsub(/\|/, "\\&verbar;");
    gsub(/ŵ/, "\\&wcirc;");
    gsub(/ý/, "\\&yacute;");
    gsub(/ŷ/, "\\&ycirc;");
    gsub(/¥/, "\\&yen;");
    gsub(/ÿ/, "\\&yuml;");
    gsub(/Ÿ/, "\\&Yuml;");
    gsub(/ź/, "\\&zacute;");
    gsub(/Ž/, "\\&Zcaron;");
    gsub(/ž/, "\\&zcaron;");
    gsub(/ż/, "\\&zdot;");
}
# makeEntities() - rewrite the current record ($0) in place, replacing
# special and accented characters with named character entities.
#
# Takes no arguments and returns nothing; the result is left in $0.
#
# The entity names follow the set this table was written for, which is not
# identical to HTML's: names such as &agr; &bgr; &dgr; &egr; &ggr; &lgr;
# &mgr; &sgr; &tgr; &phgr; &ft; &ins; &rehy; and &shilling; are not standard
# HTML entities, so verify any character you depend on.
#
# In a gsub() replacement a bare "&" means "the matched text", so the literal
# ampersand that starts each entity is written as "\\&".
function makeEntities() {
    # Must run first: every replacement below introduces an ampersand that
    # must not be escaped a second time.
    gsub(/&/, "\\&amp;");

    gsub(/á/, "\\&aacute;");
    gsub(/Á/, "\\&Aacute;");
    gsub(/ă/, "\\&abreve;");
    gsub(/â/, "\\&acirc;");
    gsub(/´/, "\\&acute;");
    gsub(/æ/, "\\&aelig;");
    gsub(/Æ/, "\\&AElig;");
    gsub(/α/, "\\&agr;");
    gsub(/à/, "\\&agrave;");
    gsub(/ā/, "\\&amacr;");
    gsub(/Ā/, "\\&Amacr;");
    gsub(/ą/, "\\&aogon;");
    gsub(/å/, "\\&aring;");
    gsub(/Å/, "\\&Aring;");
    gsub(/ã/, "\\&atilde;");
    gsub(/ä/, "\\&auml;");
    gsub(/Ä/, "\\&Auml;");
    gsub(/β/, "\\&bgr;");
    gsub(/\\/, "\\&bsol;");
    gsub(/•/, "\\&bull;");
    gsub(/ć/, "\\&cacute;");
    gsub(/č/, "\\&ccaron;");
    gsub(/Č/, "\\&Ccaron;");
    gsub(/ç/, "\\&ccedil;");
    gsub(/Ç/, "\\&Ccedil;");
    gsub(/ĉ/, "\\&ccirc;");
    gsub(/✓/, "\\&check;");
    gsub(/ˆ/, "\\&circ;");
    gsub(/@/, "\\&commat;");
    gsub(/©/, "\\&copy;");
    gsub(/‐/, "\\&dash;");
    gsub(/ď/, "\\&dcaron;");
    gsub(/°/, "\\&deg;");
    gsub(/δ/, "\\&dgr;");
    gsub(/Δ/, "\\&Dgr;");
    gsub(/¨/, "\\&die;");
    gsub(/\$/, "\\&dollar;");
    gsub(/đ/, "\\&dstrok;");
    gsub(/é/, "\\&eacute;");
    gsub(/É/, "\\&Eacute;");
    gsub(/ě/, "\\&ecaron;");
    gsub(/ê/, "\\&ecirc;");
    gsub(/è/, "\\&egrave;");
    gsub(/È/, "\\&Egrave;");
    gsub(/ε/, "\\&egr;");
    gsub(/ē/, "\\&emacr;");
    gsub(/Ē/, "\\&Emacr;");
    gsub(/ę/, "\\&eogon;");
    gsub(/ð/, "\\&eth;");
    gsub(/ë/, "\\&euml;");
    gsub(/Ë/, "\\&Euml;");
    gsub(/♭/, "\\&flat;");
    gsub(/½/, "\\&frac12;");
    gsub(/⅓/, "\\&frac13;");
    gsub(/¼/, "\\&frac14;");
    gsub(/⅕/, "\\&frac15;");
    gsub(/⅙/, "\\&frac16;");
    gsub(/⅛/, "\\&frac18;");
    gsub(/⅔/, "\\&frac23;");
    gsub(/⅖/, "\\&frac25;");
    gsub(/¾/, "\\&frac34;");
    gsub(/⅗/, "\\&frac35;");
    gsub(/⅜/, "\\&frac38;");
    gsub(/⅘/, "\\&frac45;");
    gsub(/⅝/, "\\&frac58;");
    gsub(/⅞/, "\\&frac78;");
    gsub(/′/, "\\&ft;");
    gsub(/γ/, "\\&ggr;");
    gsub(/>/, "\\&gt;");
    gsub(/½/, "\\&half;");      # never matches: already replaced by &frac12; above
    gsub(/ħ/, "\\&hstrok;");
    gsub(/í/, "\\&iacute;");
    gsub(/Í/, "\\&Iacute;");
    gsub(/î/, "\\&icirc;");
    gsub(/Î/, "\\&Icirc;");
    gsub(/ì/, "\\&igrave;");
    gsub(/ī/, "\\&imacr;");
    gsub(/″/, "\\&ins;");
    gsub(/¿/, "\\&iquest;");
    gsub(/ï/, "\\&iuml;");
    gsub(/Ï/, "\\&Iuml;");
    gsub(/ĺ/, "\\&lacute;");
    gsub(/Ĺ/, "\\&Lacute;");
    gsub(/\{/, "\\&lcub;");
    gsub(/≤/, "\\&le;");
    gsub(/λ/, "\\&lgr;");
    gsub(/_/, "\\&lowbar;");
    gsub(/\[/, "\\&lsqb;");
    gsub(/ł/, "\\&lstrok;");
    gsub(/Ł/, "\\&Lstrok;");
    gsub(/</, "\\&lt;");
    gsub(/—/, "\\&mdash;");
    gsub(/μ/, "\\&mgr;");
    gsub(/µ/, "\\&micro;");
    gsub(/·/, "\\&middot;");
    gsub(/ń/, "\\&nacute;");
    gsub(/ň/, "\\&ncaron;");
    gsub(/ņ/, "\\&ncedil;");
    gsub(/–/, "\\&ndash;");
    gsub(/ñ/, "\\&ntilde;");
    gsub(/Ñ/, "\\&Ntilde;");
    gsub(/#/, "\\&num;");
    gsub(/ó/, "\\&oacute;");
    gsub(/Ó/, "\\&Oacute;");
    gsub(/ô/, "\\&ocirc;");
    gsub(/œ/, "\\&oelig;");
    gsub(/ò/, "\\&ograve;");
    gsub(/Ω/, "\\&ohm;");
    gsub(/ō/, "\\&omacr;");
    gsub(/ø/, "\\&oslash;");
    gsub(/Ø/, "\\&Oslash;");
    gsub(/õ/, "\\&otilde;");
    gsub(/ö/, "\\&ouml;");
    gsub(/Ö/, "\\&Ouml;");
    gsub(/φ/, "\\&phgr;");
    gsub(/\+/, "\\&plus;");
    gsub(/±/, "\\&plusmn;");
    gsub(/£/, "\\&pound;");
    gsub(/ŕ/, "\\&racute;");
    gsub(/√/, "\\&radic;");
    gsub(/ř/, "\\&rcaron;");
    gsub(/Ř/, "\\&Rcaron;");
    gsub(/\}/, "\\&rcub;");
    gsub(/®/, "\\&reg;");
    gsub(/-/, "\\&rehy;");
    gsub(/\]/, "\\&rsqb;");
    gsub(/ś/, "\\&sacute;");
    gsub(/Ś/, "\\&Sacute;");
    gsub(/š/, "\\&scaron;");
    gsub(/Š/, "\\&Scaron;");
    gsub(/ş/, "\\&scedil;");
    gsub(/Ş/, "\\&Scedil;");
    gsub(/ŝ/, "\\&scirc;");
    gsub(/σ/, "\\&sgr;");
    gsub(/♯/, "\\&sharp;");
    gsub(/\//, "\\&shilling;");
    gsub(/∼/, "\\&sim;");
    gsub(/\//, "\\&sol;");      # never matches: "/" already replaced by &shilling; above
    gsub(/²/, "\\&sup2;");
    gsub(/ß/, "\\&szlig;");
    gsub(/ť/, "\\&tcaron;");
    gsub(/ţ/, "\\&tcedil;");
    gsub(/τ/, "\\&tgr;");
    gsub(/þ/, "\\&thorn;");
    gsub(/Þ/, "\\&THORN;");
    gsub(/×/, "\\&times;");
    gsub(/™/, "\\&trade;");
    gsub(/ú/, "\\&uacute;");
    gsub(/Ú/, "\\&Uacute;");
    gsub(/û/, "\\&ucirc;");
    gsub(/ù/, "\\&ugrave;");
    gsub(/ū/, "\\&umacr;");
    gsub(/¨/, "\\&uml;");       # never matches: already replaced by &die; above
    gsub(/ů/, "\\&uring;");
    gsub(/ü/, "\\&uuml;");
    gsub(/Ü/, "\\&Uuml;");
    gsub(/\|/, "\\&verbar;");
    gsub(/ŵ/, "\\&wcirc;");
    gsub(/ý/, "\\&yacute;");
    gsub(/ŷ/, "\\&ycirc;");
    gsub(/¥/, "\\&yen;");
    gsub(/ÿ/, "\\&yuml;");
    gsub(/Ÿ/, "\\&Yuml;");
    gsub(/ź/, "\\&zacute;");
    gsub(/Ž/, "\\&Zcaron;");
    gsub(/ž/, "\\&zcaron;");
    gsub(/ż/, "\\&zdot;");
}
2条答案
按热度按时间atmip9wb1#
如果只需要最基本的转义,可以这样做:
字符串
vwkv1x7d2#
当然可以。只需对每一行要转换的内容($0)调用 makeEntities() 即可,也可以把它改成接受参数的形式。这个函数是我为处理英国国家语料库(British National Corpus)而写的,该语料库使用的实体与HTML实体高度重叠,但*并非100%*一致,所以如果您需要用到一些不常见的字符,请自行核实它们是否正确。
字符串