These filters (like all my gawk code) are always written on a single line of code, and that is how I always use them, but some people prefer indentation. The indented versions of these filters are generated automatically with pgawk, which sometimes inserts control bytes (e.g. 0x00) into the gawk scripts, so they do not always work well. I recommend the original non-indented scripts because they do not contain such control characters and thus avoid these problems.
These filters output as few line breaks as possible. Use adjlines.awk below if you want the output split into several lines. The reason I avoid line breaks where possible (e.g. when outputting HTML) is to sidestep the problem that different systems use many kinds of line-break characters, for example: • U+000A line feed (LF), • U+000D carriage return (CR), • U+0085 next line (NEL), • U+2028 line separator and • U+2029 paragraph separator.
addligat.awk (or if you like indents and dislike comments maybe addligat.awk.indents works) | |
purpose | Adds HTML code for ligatures in UTF-8 encoded HTML or text files, without changing the content of HTML tags and entities. Also works with ASCII-encoded HTML files, where some extra conversions occur, for example to add circled digits and letters as follows: digits 0x24ea (0), 0x2460-0x2473 (1-20), 0x3251-0x325f (21-35), 0x32b1-0x32bf (36-50); reversed 0x24ff (0), 0x2776-0x277f (1-10), 0x24eb-0x24f4 (11-20); double 0x24f5-0x24fe (1-10); latin 0x24b6-0x24cf (A-Z), 0x24d0-0x24e9 (a-z). Read the code for details. |
code | BEGIN{anf="^(([^<]*(<([^\042\047>]|\042[^\042]*\042|\047[^\047]*\047)*>))*[^<&]*((&[a-zA-Z#][a-zA-Z0-9]+[; ])[^<&]*)*)";antlig=split("st;st;ffl;ffi;fl;fi;ff",tkn,";");bn="\134\133• ";uta=bn "(([1234]?[[:digit:]])|(50))\134\135";utb=bn "\\.((1?[[:digit:]])|(20))\134\135"} {while((nmr=gensub(anf uta,"\\7|","1"))!=$0){nmr+=0;$0=gensub(anf uta,"\\1\\&#" nmr+((nmr>35)?12941:((nmr>20)?12860:((nmr<1)?9450:9311))) ";","1")};while((nmr=gensub(anf utb,"\\7|","1"))!=$0){nmr+=0;$0=gensub(anf utb,"\\1\\&#" nmr+((nmr>10)?9440:((nmr<1)?9471:10101))";","1")};$0=gensub("\134\133•\134\135","\\◉","g");for(i=97;i<123;i++){$0=gensub(anf bn sprintf("%c",i-32) "\134\135","\\1\\&#" i+9301 ";","g");$0=gensub(anf bn sprintf("%c",i) "\134\135","\\1\\&#" i+9327 ";","g")}; for(i=1;i<=antlig;i++){while($0~(tlf=anf tkn[i])){$0=gensub(tlf,"\\1\\&#" 64263-i ";","1")}};print} |
run-example | cat filename.txt | cp2htmlz.awk | addligat.awk |
adjlines.awk (or if you like indents and dislike comments maybe adjlines.awk.indents works) | |
purpose | Adjusts line lengths in a text file to the longest possible line length, but at most 63 bytes. No word is ever split, so words longer than 63 bytes are preserved and some output lines may therefore be longer than 63 bytes. If paragraphs already exist in the text file, you may filter with mkparagr.awk first |
code | BEGIN{favoradl=63} {gsub(/ +/," ");while(length($0)>favoradl){brytpunkt=favoradl+1;if(index(substr($0,1,brytpunkt)," ")){while(substr($0,brytpunkt,1)!=" "){brytpunkt--}}else{brytpunkt=index($0 " "," ")};print substr($0,1,brytpunkt-1);$0=substr($0 "  ",brytpunkt+1);sub(/ +$/,"")};if($0!=""){if(RT==""){printf "%s",$0}else{print $0 "\n"}}} # wraps each record at the last space within favoradl+1 characters, or at the first space if there is none; a finished record is followed by an empty line unless the input ended without a record terminator |
run-example | cat filename.txt | mkparagr.awk | adjlines.awk > filename.htm |
catnline.awk (or if you like indents and dislike comments maybe catnline.awk.indents works) | |
purpose | Filters so that ends of lines, exclamation marks, apostrophes and backslashes are escaped. You may output the result later with: echo 'TheOutputOfCatnline' |
code | {if(NR>1){printf "\\n"};gsub(/\\/,"\\\\&");gsub(/\041/,"\\041");gsub(/\047/,"\\047");printf "%s", $0} # joins all lines with a literal \n, doubles every backslash, then writes ! as \041 and the apostrophe as \047; the replacement "\\\\&" means one literal backslash plus the matched backslash, which doubles it under both the older and the POSIX gsub rules (a plain "\\\\" yields a single backslash in gawk 4 and later) |
run-example | catnline file.txt |
checkmir.awk (or if you like indents and dislike comments maybe checkmir.awk.indents works) | |
purpose | checks that parentheses and other mirror characters in an ISO_8859, CP_1252 or ASCII file occur in pairs. Prints lines where this is not the case, beginning with the line number between the mirror characters |
code | BEGIN{nbsp="\240";mdash="\227";if(length(sprintf("%c%c%c%c",0,16,216,128))==3){nbsp="\302\240";mdash="\342\200\224"};peili="(;);[;];{;};-" nbsp ";" nbsp "-;<B>;</B>;<EM>;</EM>;<I>;</I>;<S>;</S>;<STRIKE>;</STRIKE>;<SUB>;</SUB>;<SUP>;</SUP>;<TT>;</TT>;<U>;</U>;" mdash nbsp ";" nbsp mdash;antalspeglar=split(peili,tkn,";")/2;IGNORECASE=1}{for(peili=1;peili<=antalspeglar;peili++){if((split($0,slask,tkn[peili*2-1])!=split($0,slask,tkn[peili*2]))||(index($0,tkn[peili*2-1] tkn[peili*2]))||($0~/""/)){print tkn[peili*2-1] NR tkn[peili*2] " " $0}} if((split($0,slask,"\"")%2==0)&&(length($0)>0)){print "\"" NR "\"" " " $0}} # the no-break space and dash used in the pair list are chosen per environment before the list is built; previously two gsub calls without a target tried to convert them but acted on the empty $0 in BEGIN, so the list was never changed |
checknas.awk (or if you like indents and dislike comments maybe checknas.awk.indents works) | |
purpose | checks for non-ascii characters in a file |
code | {for(kod=128;kod<=255;kod++)if($0~sprintf("%c",kod))print NR " " kod} # for every code 128-255 whose %c character occurs in the line, prints the line number followed by that code |
chsguess.awk (or if you like indents and dislike comments maybe chsguess.awk.indents works) | |
purpose | Outputs the names of the possible charsets that encode a given character as a given byte (or byte group). Notice that different versions of "echo" may behave in different ways in different environments when trying to send individual bytes. For this reason, it may be better to use printf as in the following examples. This filter works in UTF-8, ISO_8859-1 and CP-1252 environments |
code | BEGIN{RS="";ORS=" ";FS=ORS;gc="iconv -l";gc | getline;close(gc);gsub(/[ ,\/\n]+/," ");split($0,ct);for(i in ct){cs[ct[i]]=""};ocs=substr("cp1252utf8",13-6*length("\303\244"),6)}{citat=gensub(/\047/,"\\\\" "047","g",$1);for(i in cs){ch="";gc="echo \047" citat "\047 | iconv -c -f \"" i "\" -t " ocs " - 2> /dev/null";gc | getline ch;close(gc);if(ch==$2){print i}};printf "\n"} # BEGIN collects the charset names listed by iconv -l and picks utf8 or cp1252 as target from the length of a two-byte literal; each record then has its first field converted from every listed charset and the charset is printed when the result equals the second field |
run-example | printf "OneByteOrByteGroup OneCharacter" | chsguess.awk |
cp2htmla.awk (or if you like indents and dislike comments maybe cp2htmla.awk.indents works) | |
purpose | convert the four ASCII characters &, <, > and " to their HTML encodings in a text file. See cp2htmlc.awk for UTF-8, ISO_8859-1 and CP-1252 to HTML conversions |
code | {gsub(/&/,"\\&amp;");gsub(/</,"\\&lt;");gsub(/>/,"\\&gt;");gsub(/\"/,"\\&quot;");print} # the ampersand is converted first so that the entities inserted afterwards are not escaped again; "\\&" in a gsub replacement is a literal ampersand |
cp2htmlc.awk (or if you like indents and dislike comments maybe cp2htmlc.awk.indents works) | |
purpose | a text2xhtml / text2html filter converting text to (x)html. Uses 0xfffd for unknown and control characters. Doesn't change the four characters <>&" so the text file may already include some ASCII encoded HTML code — if you need to convert those four characters, please filter with 'cp2htmla.awk' first, then with 'cp2htmlc.awk' (see example). Assumes that paragraphs in the input text file are separated by an empty line. If this is not the case, filter with 'xtraline.awk' first (see example). For special HTML formatting, see 'cp2htmld.awk', 'cp2htmll.awk', 'cp2htmlp.awk', 'cp2htmls.awk', 'cp2htmlu.awk', 'cp2htmly.awk' and 'cp2htmlz.awk' (or use switches -D, -L, -P, -S, -U, -Y and -Z respectively). In a UTF-8 environment, assumes the input file is UTF-8 encoded text and outputs a UTF-8 encoded XHTML+RDFa 1.0 file; for the output file to be readable with old (or badly updated) browsers, give the value 1 to the age variable at the beginning of the filter. In an eight-bit character environment, assumes that the input text file is ISO_8859-1 or CP-1252 encoded and outputs an ASCII-encoded HTML 4.01 file. |
code | BEGIN{age=1;argoval="C";if((ARGV[1]~/^-/)&&(length(ARGV[1])==2)){chs=toupper(substr(ARGV[1],2,1));delete ARGV[1];if("CDLPSUYZ"~chs){argoval=chs}};FS=":";RS="";if(((chs=length(sprintf("%c%c%c%c",0,16,216,128)))==4)&&(argoval=="P")){argoval="C"};split("euro:#xfffd:sbquo:fnof:bdquo:hellip:dagger:Dagger:circ:permil:Scaron:lsaquo:OElig:#xfffd:#x017d:#xfffd:#xfffd:lsquo:rsquo:ldquo:rdquo:bull:ndash:mdash:tilde:trade:scaron:rsaquo:oelig:#xfffd:#x017e:Yuml:nbsp:iexcl:cent:pound:curren:yen:brvbar:sect:uml:copy:ordf:laquo:not:shy:reg:macr:deg:plusmn:sup2:sup3:acute:micro:para:middot:cedil:sup1:ordm:raquo:frac14:frac12:frac34:iquest:Agrave:Aacute:Acirc:Atilde:Auml:Aring:AElig:Ccedil:Egrave:Eacute:Ecirc:Euml:Igrave:Iacute:Icirc:Iuml:ETH:Ntilde:Ograve:Oacute:Ocirc:Otilde:Ouml:times:Oslash:Ugrave:Uacute:Ucirc:Uuml:Yacute:THORN:szlig:agrave:aacute:acirc:atilde:auml:aring:aelig:ccedil:egrave:eacute:ecirc:euml:igrave:iacute:icirc:iuml:eth:ntilde:ograve:oacute:ocirc:otilde:ouml:divide:oslash:ugrave:uacute:ucirc:uuml:yacute:thorn:yuml",tkn);for(i=1;i<split("US-ASCII:UTF-8: :\302\240:—:\342\200\224:­:\302\255:�:\357\277\275:​:\342\200\213:</BODY></HTML>:</body></html>",mt);i+=2){nt[gensub(/[\\\&;]/,"","g",mt[i])]=mt[i+(chs % 2)]};delete mt;shyv="\\1\\" nt["shy"] "\\2";if("AZ"!~argoval){if((argoval=="C")||(argoval=="D")){stt="P.d {text-align: justify;}"};if(argoval=="D"){stt="H2.b {color: #f00;} " stt};if(argoval=="L"){stt="P.l {line-height: 87%; margin-bottom: 0.01cm; margin-left: 0.6cm; text-align: justify; text-indent: -0.6cm;} P.t {font-weight: bold;}"};if(argoval=="S"){stt="P.s {line-height: 87%; margin-bottom: 0.03cm; margin-left: 0.4cm; text-align: justify; text-indent: -0.4cm;}"};if(argoval=="U"){stt="line-height: 150%; margin-left: 1cm; page-break-inside: avoid; text-align: left; text-indent: -1cm;";stt="P.c {line-height: 120%; margin-bottom: 0.7cm; margin-left: 2cm; margin-right: 1cm; text-align: justify; } P.f {" stt "} P.p {font-size: smaller; "stt "} P.r 
{font-weight: bold; line-height: 150%; margin-top: 1cm; page-break-after: avoid; text-align: left; } P.u {line-height: 150%; margin-bottom: 0.5cm; text-align: justify;}"};if(stt!=""){stt="<STYLE TYPE=\"text/css\"><!-- " stt " --></STYLE>"};stt=stt "<LINK REL=\"stylesheet\" TYPE=\"text/css\" HREF=\"http://www.acc.umu.se/~saasha/gemensam.css\" /></HEAD><BODY>";if(chs==3){stt=tolower(stt)};if(argoval=="P"){stt="<link rel=\"stylesheet\" type=\"text/css\" media=\"screen, projection, print\" href=\"http://www.w3.org/Talks/Tools/Slidy/slidy.css\" /><script src=\"http://www.w3.org/Talks/Tools/Slidy/slidy.js\" charset=\"utf-8\" type=\"text/javascript\"> </script></head><body>"};if(chs==3){if(age==0){printf "%s","<?xml version=\"1.0\" encoding=\"" nt["US-ASCII"] "\"?>"}else{stt="<meta http-equiv=\"Content-type\" content=\"text/html; charset=" nt["US-ASCII"] "\" />" stt};printf "%s","<\041DOCTYPE html PUBLIC \"-//W3C//DTD XHTML+RDFa 1.0//EN\" \"http://www.w3.org/MarkUp/DTD/xhtml-rdfa-1.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\" version=\"XHTML+RDFa 1.0\" xml:lang=\"sv\"><head><title></title>" stt};if(chs!=3){sub(/ \/>/,">",stt);printf "%s","<\041DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\"><HTML><HEAD><META HTTP-EQUIV=\"content-type\" CONTENT=\"text/html; charset=" nt["US-ASCII"] "\"><TITLE></TITLE>" stt}}}{cla="";if(match($0,/^<[hH][1-6]>/)){stt="H" substr($0,3,1);gsub(/^<[hH][1-6]>|<\/[hH][1-6]>$/,"")}else{stt="P";if(argoval=="C"){cla="d"}};gsub(/[\a\b\v\n\r\t ]+/," ");gsub(/^ | $/,"");if((argoval=="U")&&(($0~/^\\/)||($0~/^:; /))){cla="p";flt="cp2htmla.awk";print |& flt;close(flt,"to");flt |& getline;close(flt);sub(/^\\?: *; */,"")};for(i=0;i<33;i++){gsub(sprintf("%c",(i+127)%128),nt["#xfffd"])};if($0=="[extrarad]"){$0=nt["nbsp"]};if(chs!=3){for(i=1;i<129;i++){gsub(sprintf("%c",i+127),"\\&" tkn[i] 
";")}};if($0=="[sidbrytning]"){$0="\f"};if(argoval=="D"){if((length($0)<70)&&(substr($0,1,2)!="s.")&&((substr($0,1,11)!~"^\\[sida" nt["nbsp"])||(substr($0,length($0)-2,3)!="ar]"))){stt="H2";if(index($0,nt["nbsp"] nt["mdash"] nt["nbsp"])){cla="b"}};if(NR==1){stt="H1"};if(stt=="P"){cla="d"};};if(argoval=="L"){if(substr($0,1,8)~"^s." nt["nbsp"]){cla="l"}else{cla="t"}};if(argoval=="S"){cla="s"};if((argoval=="U")&&(cla=="")){cla="u";versalrad=$0;gsub(/&#?[[:alnum:]]+;/,"",versalrad);if((versalrad==toupper(versalrad))&&(versalrad~/[[:alpha:]][[:alpha:]]/)){cla="r"};if($0~/^(&[lr][ads]quo;|\302\253|\342\200\234|\342\200\230|\302\273|\342\200\235|\342\200\231)/){cla="c"};if((($0~/^[^ ]+[\(\[]/)||($0~/\014/))&&(cla!="c")){$0=gensub("(^[^ ]+)" nt["nbsp"] "(and|och)" nt["nbsp"],"\\1 \\2 ","1");$0=gensub("^([^([]+\\.,)" nt["nbsp"],"\\1 ","g");cla="f"}else{$0=gensub(/([[:alnum:]]\/)([[:alpha:]])/,"\\1\\" nt["#x200b"] "\\2","g");$0=gensub(/(ations|iblioteks|lldhets)([[:alpha:]])/,shyv,"g");$0=gensub(/(nings)(s[[:alpha:]])/,shyv,"g");$0=gensub(/([b-df-hj-np-tv-xz]s)(system)/,shyv,"g")}};if("YZ"~argoval){printf "%s",$0}else{if(cla!=""){cla=" CLASS=\"" cla "\""};if(chs==3){cla=tolower(cla);stt=tolower(stt)};printf "%s","<" stt cla ">" $0 "</" stt ">"}} END{if("AZ"!~argoval){printf nt["</BODY></HTML>"]}} |
run-example | cat filename.txt | xtraline.awk | cp2htmla.awk | cp2htmlc.awk | addligat.awk > filename.htm |
cp2htmld.awk (or if you like indents and dislike comments maybe cp2htmld.awk.indents works) | |
purpose | mostly like 'cp2htmlc.awk -D', but cp2htmld.awk tries to guess some features in the input text to make a 'default' formatting |
code | BEGIN{age=1;argoval="D";if((ARGV[1]~/^-/)&&(length(ARGV[1])==2)){chs=toupper(substr(ARGV[1],2,1));delete ARGV[1];if("CDLPSUYZ"~chs){argoval=chs}};FS=":";RS="";if(((chs=length("\000\020\303\200"))==4)&&(argoval=="P")){argoval="C"};split("euro:#xfffd:sbquo:fnof:bdquo:hellip:dagger:Dagger:circ:permil:Scaron:lsaquo:OElig:#xfffd:#x017d:#xfffd:#xfffd:lsquo:rsquo:ldquo:rdquo:bull:ndash:mdash:tilde:trade:scaron:rsaquo:oelig:#xfffd:#x017e:Yuml:nbsp:iexcl:cent:pound:curren:yen:brvbar:sect:uml:copy:ordf:laquo:not:shy:reg:macr:deg:plusmn:sup2:sup3:acute:micro:para:middot:cedil:sup1:ordm:raquo:frac14:frac12:frac34:iquest:Agrave:Aacute:Acirc:Atilde:Auml:Aring:AElig:Ccedil:Egrave:Eacute:Ecirc:Euml:Igrave:Iacute:Icirc:Iuml:ETH:Ntilde:Ograve:Oacute:Ocirc:Otilde:Ouml:times:Oslash:Ugrave:Uacute:Ucirc:Uuml:Yacute:THORN:szlig:agrave:aacute:acirc:atilde:auml:aring:aelig:ccedil:egrave:eacute:ecirc:euml:igrave:iacute:icirc:iuml:eth:ntilde:ograve:oacute:ocirc:otilde:ouml:divide:oslash:ugrave:uacute:ucirc:uuml:yacute:thorn:yuml",tkn);for(i=1;i<split("US-ASCII:UTF-8: :\302\240:—:\342\200\224:­:\302\255:�:\357\277\275:​:\342\200\213:</BODY></HTML>:</body></html>",mt);i+=2){nt[gensub(/[\\\&;]/,"","g",mt[i])]=mt[i+(chs % 2)]};delete mt;shyv="\\1\\" nt["shy"] "\\2";if("AZ"!~argoval){if((argoval=="C")||(argoval=="D")){stt="P.d {text-align: justify;}"};if(argoval=="D"){stt="H2.b {color: #f00;} " stt};if(argoval=="L"){stt="P.l {line-height: 87%; margin-bottom: 0.01cm; margin-left: 0.6cm; text-align: justify; text-indent: -0.6cm;} P.t {font-weight: bold;}"};if(argoval=="S"){stt="P.s {line-height: 87%; margin-bottom: 0.03cm; margin-left: 0.4cm; text-align: justify; text-indent: -0.4cm;}"};if(argoval=="U"){stt="line-height: 150%; margin-left: 1cm; page-break-inside: avoid; text-align: left; text-indent: -1cm;";stt="P.c {line-height: 120%; margin-bottom: 0.7cm; margin-left: 2cm; margin-right: 1cm; text-align: justify; } P.f {" stt "} P.p {font-size: smaller; "stt "} P.r {font-weight: bold; 
line-height: 150%; margin-top: 1cm; page-break-after: avoid; text-align: left; } P.u {line-height: 150%; margin-bottom: 0.5cm; text-align: justify;}"};if(stt!=""){stt="<STYLE TYPE=\"text/css\"><!-- " stt " --></STYLE>"};stt=stt "<LINK REL=\"stylesheet\" TYPE=\"text/css\" HREF=\"http://www.acc.umu.se/~saasha/gemensam.css\" /></HEAD><BODY>";if(chs==3){stt=tolower(stt)};if(argoval=="P"){stt="<link rel=\"stylesheet\" type=\"text/css\" media=\"screen, projection, print\" href=\"http://www.w3.org/Talks/Tools/Slidy/slidy.css\" /><script src=\"http://www.w3.org/Talks/Tools/Slidy/slidy.js\" charset=\"utf-8\" type=\"text/javascript\"> </script></head><body>"};if(chs==3){if(age==0){printf "%s","<?xml version=\"1.0\" encoding=\"" nt["US-ASCII"] "\"?>"}else{stt="<meta http-equiv=\"Content-type\" content=\"text/html; charset=" nt["US-ASCII"] "\" />" stt};printf "%s","<\041DOCTYPE html PUBLIC \"-//W3C//DTD XHTML+RDFa 1.0//EN\" \"http://www.w3.org/MarkUp/DTD/xhtml-rdfa-1.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\" version=\"XHTML+RDFa 1.0\" xml:lang=\"sv\"><head><title></title>" stt};if(chs!=3){sub(/ \/>/,">",stt);printf "%s","<\041DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\"><HTML><HEAD><META HTTP-EQUIV=\"content-type\" CONTENT=\"text/html; charset=" nt["US-ASCII"] "\"><TITLE></TITLE>" stt}}}{cla="";if(match($0,/^<[hH][1-6]>/)){stt="H" substr($0,3,1);gsub(/^<[hH][1-6]>|<\/[hH][1-6]>$/,"")}else{stt="P";if(argoval=="C"){cla="d"}};gsub(/[\a\b\v\n\r\t ]+/," ");gsub(/^ | $/,"");if((argoval=="U")&&(($0~/^\\/)||($0~/^:; /))){cla="p";flt="cp2htmla.awk";print |& flt;close(flt,"to");flt |& getline;close(flt);sub(/^\\?: *; */,"")};for(i=0;i<33;i++){gsub(sprintf("%c",(i+127)%128),nt["#xfffd"])};if($0=="[extrarad]"){$0=nt["nbsp"]};if(chs!=3){for(i=1;i<129;i++){gsub(sprintf("%c",i+127),"\\&" tkn[i] ";")}};if($0=="[sidbrytning]"){$0="\f"};if(argoval=="D"){if((length($0)<70)&&(substr($0,1,2)!="s.")&&((substr($0,1,11)!~"^\\[sida" 
nt["nbsp"])||(substr($0,length($0)-2,3)!="ar]"))){stt="H2";if(index($0,nt["nbsp"] nt["mdash"] nt["nbsp"])){cla="b"}};if(NR==1){stt="H1"};if(stt=="P"){cla="d"};};if(argoval=="L"){if(substr($0,1,8)~"^s." nt["nbsp"]){cla="l"}else{cla="t"}};if(argoval=="S"){cla="s"};if((argoval=="U")&&(cla=="")){cla="u";versalrad=$0;gsub(/&#?[[:alnum:]]+;/,"",versalrad);if((versalrad==toupper(versalrad))&&(versalrad~/[[:alpha:]][[:alpha:]]/)){cla="r"};if($0~/^(&[lr][ads]quo;|\302\253|\342\200\234|\342\200\230|\302\273|\342\200\235|\342\200\231)/){cla="c"};if(($0~/^[^ ]+[\(\[]/)||($0~/\014/)){$0=gensub("(^[^ ]+)" nt["nbsp"] "(and|och)" nt["nbsp"],"\\1 \\2 ","1");$0=gensub("^([^([]+\\.,)" nt["nbsp"],"\\1 ","g");cla="f"}else{$0=gensub(/([[:alnum:]]\/)([[:alpha:]])/,"\\1\\" nt["#x200b"] "\\2","g");$0=gensub(/(ations|iblioteks|lldhets)([[:alpha:]])/,shyv,"g");$0=gensub(/(nings)(s[[:alpha:]])/,shyv,"g");$0=gensub(/([b-df-hj-np-tv-xz]s)(system)/,shyv,"g")}};if("YZ"~argoval){printf "%s",$0}else{if(cla!=""){cla=" CLASS=\"" cla "\""};if(chs==3){cla=tolower(cla);stt=tolower(stt)};printf "%s","<" stt cla ">" $0 "</" stt ">"}} END{if("AZ"!~argoval){printf nt["</BODY></HTML>"]}} |
cp2htmll.awk (or if you like indents and dislike comments maybe cp2htmll.awk.indents works) | |
purpose | mostly like 'cp2htmlc.awk -L', but cp2htmll.awk is a paper-saving alternative for text you wish to print out |
code | BEGIN{argoval="L";if((ARGV[1]~/^-/)&&(length(ARGV[1])==2)){chs=toupper(substr(ARGV[1],2,1));delete ARGV[1];if("CDLPSUYZ"~chs){argoval=chs}};FS=":";RS="";if(((chs=length("\000\020\303\200"))==4)&&(argoval=="P")){argoval="C"};split("euro:#xfffd:sbquo:fnof:bdquo:hellip:dagger:Dagger:circ:permil:Scaron:lsaquo:OElig:#xfffd:#x017d:#xfffd:#xfffd:lsquo:rsquo:ldquo:rdquo:bull:ndash:mdash:tilde:trade:scaron:rsaquo:oelig:#xfffd:#x017e:Yuml:nbsp:iexcl:cent:pound:curren:yen:brvbar:sect:uml:copy:ordf:laquo:not:shy:reg:macr:deg:plusmn:sup2:sup3:acute:micro:para:middot:cedil:sup1:ordm:raquo:frac14:frac12:frac34:iquest:Agrave:Aacute:Acirc:Atilde:Auml:Aring:AElig:Ccedil:Egrave:Eacute:Ecirc:Euml:Igrave:Iacute:Icirc:Iuml:ETH:Ntilde:Ograve:Oacute:Ocirc:Otilde:Ouml:times:Oslash:Ugrave:Uacute:Ucirc:Uuml:Yacute:THORN:szlig:agrave:aacute:acirc:atilde:auml:aring:aelig:ccedil:egrave:eacute:ecirc:euml:igrave:iacute:icirc:iuml:eth:ntilde:ograve:oacute:ocirc:otilde:ouml:divide:oslash:ugrave:uacute:ucirc:uuml:yacute:thorn:yuml",tkn);for(i=1;i<split("US-ASCII:UTF-8: :\302\240:—:\342\200\224:­:\302\255:�:\357\277\275:​:\342\200\213:</BODY></HTML>:</body></html>",mt);i+=2){nt[gensub(/[\\\&;]/,"","g",mt[i])]=mt[i+(chs % 2)]};delete mt;shyv="\\1\\" nt["shy"] "\\2";if("AZ"!~argoval){if((argoval=="C")||(argoval=="D")){stt="P.d {text-align: justify;}"};if(argoval=="D"){stt="H2.b {color: #f00;} " stt};if(argoval=="L"){stt="P.l {line-height: 87%; margin-bottom: 0.01cm; margin-left: 0.6cm; text-align: justify; text-indent: -0.6cm;} P.t {font-weight: bold;}"};if(argoval=="S"){stt="P.s {line-height: 87%; margin-bottom: 0.03cm; margin-left: 0.4cm; text-align: justify; text-indent: -0.4cm;}"};if(argoval=="U"){stt="line-height: 150%; margin-left: 1cm; page-break-inside: avoid; text-align: left; text-indent: -1cm;";stt="P.c {line-height: 120%; margin-bottom: 0.7cm; margin-left: 2cm; margin-right: 1cm; text-align: justify; } P.f {" stt "} P.p {font-size: smaller; "stt "} P.r {font-weight: bold; 
line-height: 150%; margin-top: 1cm; page-break-after: avoid; text-align: left; } P.u {line-height: 150%; margin-bottom: 0.5cm; text-align: justify;}"};stt="<STYLE TYPE=\"text/css\"><!-- " stt " --></STYLE></HEAD><BODY>";if(chs==3){stt=tolower(stt)};if(argoval=="P"){stt="<link rel=\"stylesheet\" type=\"text/css\" media=\"screen, projection, print\" href=\"http://www.w3.org/Talks/Tools/Slidy/slidy.css\" /><script src=\"http://www.w3.org/Talks/Tools/Slidy/slidy.js\" charset=\"utf-8\" type=\"text/javascript\"></script></head><body>"};if(chs==3){printf "%s","<?xml version=\"1.0\" encoding=\"" nt["US-ASCII"] "\"?><\041DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"sv\"><head><title></title>" stt};if(chs!=3){printf "%s","<\041DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\"><HTML><HEAD><META HTTP-EQUIV=\"content-type\" CONTENT=\"text/html; charset=" nt["US-ASCII"] "\"><TITLE></TITLE>" stt}}}{cla="";if(match($0,/^<[hH][1-6]>/)){stt="H" substr($0,3,1);gsub(/^<[hH][1-6]>|<\/[hH][1-6]>$/,"")}else{stt="P";if(argoval=="C"){cla="d"}};gsub(/[\a\b\v\n\r\t ]+/," ");gsub(/^ | $/,"");if((argoval=="U")&&(($0~/^\\/)||($0~/^:; /))){cla="p";flt="cp2htmla.awk";print |& flt;close(flt,"to");flt |& getline;close(flt);sub(/^\\?: *; */,"")};for(i=0;i<33;i++){gsub(sprintf("%c",(i+127)%128),nt["#xfffd"])};if($0=="[extrarad]"){$0=nt["nbsp"]};if(chs!=3){for(i=1;i<129;i++){gsub(sprintf("%c",i+127),"\\&" tkn[i] ";")}};if($0=="[sidbrytning]"){$0="\f"};if(argoval=="D"){if((length($0)<70)&&(substr($0,1,2)!="s.")&&((substr($0,1,11)!~"^\\[sida" nt["nbsp"])||(substr($0,length($0)-2,3)!="ar]"))){stt="H2";if(index($0,nt["nbsp"] nt["mdash"] nt["nbsp"])){cla="l"}};if(NR==1){stt="H1"};if(stt=="P"){cla="d"};};if(argoval=="L"){if(substr($0,1,8)~"^s." 
nt["nbsp"]){cla="l"}else{cla="t"}};if(argoval=="S"){cla="s"};if((argoval=="U")&&(cla=="")){cla="u";versalrad=$0;gsub(/&#?[[:alnum:]]+;/,"",versalrad);if((versalrad==toupper(versalrad))&&(versalrad~/[[:alpha:]][[:alpha:]]/)){cla="r"};if($0~/^(&[lr][ads]quo;|\302\253|\342\200\234|\342\200\230|\302\273|\342\200\235|\342\200\231)/){cla="c"};if(($0~/^[^ ]+[\(\[]/)||($0~/\014/)){$0=gensub("(^[^ ]+)" nt["nbsp"] "(and|och)" nt["nbsp"],"\\1 \\2 ","1");$0=gensub("^([^([]+\\.,)" nt["nbsp"],"\\1 ","g");cla="f"}else{$0=gensub(/([[:alnum:]]\/)([[:alpha:]])/,"\\1\\" nt["#x200b"] "\\2","g");$0=gensub(/(ations|iblioteks|lldhets)([[:alpha:]])/,shyv,"g");$0=gensub(/([b-df-hj-np-tv-xz]s)(system)/,shyv,"g")}};if("YZ"~argoval){printf "%s",$0}else{if(cla!=""){cla=" CLASS=\"" cla "\""};if(chs==3){cla=tolower(cla);stt=tolower(stt)};printf "%s","<" stt cla ">" $0 "</" stt ">"}} END{if("AZ"!~argoval){printf nt["</BODY></HTML>"]}} |
cp2htmlp.awk (or if you like indents and dislike comments maybe cp2htmlp.awk.indents works) | |
purpose | WORK-IN-PROGRESS mostly like 'cp2htmlc.awk -P', but in UTF-8 environments 'cp2htmlp.awk' is a special-formatting alternative to output slidy-presentations. In an eight-bit character environment (NON utf-8), 'cp2htmlp.awk' does the same work as 'cp2htmlc.awk'. |
code | BEGIN{argoval="P";if((ARGV[1]~/^-/)&&(length(ARGV[1])==2)){chs=toupper(substr(ARGV[1],2,1));delete ARGV[1];if("CDLPSUYZ"~chs){argoval=chs}};FS=":";RS="";if(((chs=length("\000\020\303\200"))==4)&&(argoval=="P")){argoval="C"};split("euro:#xfffd:sbquo:fnof:bdquo:hellip:dagger:Dagger:circ:permil:Scaron:lsaquo:OElig:#xfffd:#x017d:#xfffd:#xfffd:lsquo:rsquo:ldquo:rdquo:bull:ndash:mdash:tilde:trade:scaron:rsaquo:oelig:#xfffd:#x017e:Yuml:nbsp:iexcl:cent:pound:curren:yen:brvbar:sect:uml:copy:ordf:laquo:not:shy:reg:macr:deg:plusmn:sup2:sup3:acute:micro:para:middot:cedil:sup1:ordm:raquo:frac14:frac12:frac34:iquest:Agrave:Aacute:Acirc:Atilde:Auml:Aring:AElig:Ccedil:Egrave:Eacute:Ecirc:Euml:Igrave:Iacute:Icirc:Iuml:ETH:Ntilde:Ograve:Oacute:Ocirc:Otilde:Ouml:times:Oslash:Ugrave:Uacute:Ucirc:Uuml:Yacute:THORN:szlig:agrave:aacute:acirc:atilde:auml:aring:aelig:ccedil:egrave:eacute:ecirc:euml:igrave:iacute:icirc:iuml:eth:ntilde:ograve:oacute:ocirc:otilde:ouml:divide:oslash:ugrave:uacute:ucirc:uuml:yacute:thorn:yuml",tkn);for(i=1;i<split("US-ASCII:UTF-8: :\302\240:—:\342\200\224:­:\302\255:�:\357\277\275:​:\342\200\213:</BODY></HTML>:</body></html>",mt);i+=2){nt[gensub(/[\\\&;]/,"","g",mt[i])]=mt[i+(chs % 2)]};delete mt;shyv="\\1\\" nt["shy"] "\\2";if("AZ"!~argoval){if((argoval=="C")||(argoval=="D")){stt="P.d {text-align: justify;}"};if(argoval=="D"){stt="H2.b {color: #f00;} " stt};if(argoval=="L"){stt="P.l {line-height: 87%; margin-bottom: 0cm; margin-left: 0.6cm; text-align: justify; text-indent: -0.6cm;} P.t {font-weight: bold;}"};if(argoval=="S"){stt="P.s {line-height: 87%; margin-bottom: 0.03cm; margin-left: 0.4cm; text-align: justify; text-indent: -0.4cm;}"};if(argoval=="U"){stt="line-height: 150%; margin-left: 1cm; page-break-inside: avoid; text-align: left; text-indent: -1cm;";stt="P.c {line-height: 120%; margin-bottom: 0.7cm; margin-left: 2cm; margin-right: 1cm; text-align: justify; } P.f {" stt "} P.p {font-size: smaller; "stt "} P.r {font-weight: bold; line-height: 
150%; margin-top: 1cm; page-break-after: avoid; text-align: left; } P.u {line-height: 150%; margin-bottom: 0.5cm; text-align: justify;}"};stt="<STYLE TYPE=\"text/css\"><!-- " stt " --></STYLE></HEAD><BODY>";if(chs==3){stt=tolower(stt)};if(argoval=="P"){stt="<link rel=\"stylesheet\" type=\"text/css\" media=\"screen, projection, print\" href=\"http://www.w3.org/Talks/Tools/Slidy/slidy.css\" /><script src=\"http://www.w3.org/Talks/Tools/Slidy/slidy.js\" charset=\"utf-8\" type=\"text/javascript\"></script></head><body>"};if(chs==3){printf "%s","<?xml version=\"1.0\" encoding=\"" nt["US-ASCII"] "\"?><\041DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"sv\"><head><title></title>" stt};if(chs!=3){printf "%s","<\041DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\"><HTML><HEAD><META HTTP-EQUIV=\"content-type\" CONTENT=\"text/html; charset=" nt["US-ASCII"] "\"><TITLE></TITLE>" stt}}}{cla="";if(match($0,/^<[hH][1-6]>/)){stt="H" substr($0,3,1);gsub(/^<[hH][1-6]>|<\/[hH][1-6]>$/,"")}else{stt="P";if(argoval=="C"){cla="d"}};gsub(/[\a\b\v\n\r\t ]+/," ");gsub(/^ | $/,"");if((argoval=="U")&&(($0~/^\\/)||($0~/^:; /))){cla="p";flt="cp2htmla.awk";print |& flt;close(flt,"to");flt |& getline;close(flt);sub(/^\\?: *; */,"")};for(i=0;i<33;i++){gsub(sprintf("%c",(i+127)%128),nt["#xfffd"])};if($0=="[extrarad]"){$0=nt["nbsp"]};if(chs!=3){for(i=1;i<129;i++){gsub(sprintf("%c",i+127),"\\&" tkn[i] ";")}};if($0=="[sidbrytning]"){$0="\f"};if(argoval=="D"){if((length($0)<70)&&(substr($0,1,2)!="s.")&&((substr($0,1,11)!~"^\\[sida" nt["nbsp"])||(substr($0,length($0)-2,3)!="ar]"))){stt="H2";if(index($0,nt["nbsp"] nt["mdash"] nt["nbsp"])){cla="l"}};if(NR==1){stt="H1"};if(stt=="P"){cla="d"};};if(argoval=="L"){if(substr($0,1,8)~"^s." 
nt["nbsp"]){cla="l"}else{cla="b"}};if(argoval=="S"){cla="s"};if((argoval=="U")&&(cla=="")){cla="u";versalrad=$0;gsub(/&#?[[:alnum:]]+;/,"",versalrad);if((versalrad==toupper(versalrad))&&(versalrad~/[[:alpha:]][[:alpha:]]/)){cla="r"};if($0~/^(&[lr][ads]quo;|\302\253|\342\200\234|\342\200\230|\302\273|\342\200\235|\342\200\231)/){cla="c"};if(($0~/^[^ ]+[\(\[]/)||($0~/\014/)){$0=gensub("(^[^ ]+)" nt["nbsp"] "(and|och)" nt["nbsp"],"\\1 \\2 ","1");$0=gensub("^([^([]+\\.,)" nt["nbsp"],"\\1 ","g");cla="f"}else{$0=gensub(/([[:alnum:]]\/)([[:alpha:]])/,"\\1\\" nt["#x200b"] "\\2","g");$0=gensub(/(ations|iblioteks|lldhets)([[:alpha:]])/,shyv,"g");$0=gensub(/([b-df-hj-np-tv-xz]s)(system)/,shyv,"g")}};if("YZ"~argoval){printf "%s",$0}else{if(cla!=""){cla=" CLASS=\"" cla "\""};if(chs==3){cla=tolower(cla);stt=tolower(stt)};printf "%s","<" stt cla ">" $0 "</" stt ">"}} END{if("AZ"!~argoval){printf nt["</BODY></HTML>"]}} |
run-example | grep "something" filename.txt | cp2htmlp.awk |
cp2htmls.awk (or if you like indents and dislike comments maybe cp2htmls.awk.indents works) | |
purpose | mostly like 'cp2htmlc.awk -S', but cp2htmls.awk is a special-formatting alternative to output in columns |
code | BEGIN{argoval="S";if((ARGV[1]~/^-/)&&(length(ARGV[1])==2)){chs=toupper(substr(ARGV[1],2,1));delete ARGV[1];if("CDLPSUYZ"~chs){argoval=chs}};FS=":";RS="";if(((chs=length("\000\020\303\200"))==4)&&(argoval=="P")){argoval="C"};split("euro:#xfffd:sbquo:fnof:bdquo:hellip:dagger:Dagger:circ:permil:Scaron:lsaquo:OElig:#xfffd:#x017d:#xfffd:#xfffd:lsquo:rsquo:ldquo:rdquo:bull:ndash:mdash:tilde:trade:scaron:rsaquo:oelig:#xfffd:#x017e:Yuml:nbsp:iexcl:cent:pound:curren:yen:brvbar:sect:uml:copy:ordf:laquo:not:shy:reg:macr:deg:plusmn:sup2:sup3:acute:micro:para:middot:cedil:sup1:ordm:raquo:frac14:frac12:frac34:iquest:Agrave:Aacute:Acirc:Atilde:Auml:Aring:AElig:Ccedil:Egrave:Eacute:Ecirc:Euml:Igrave:Iacute:Icirc:Iuml:ETH:Ntilde:Ograve:Oacute:Ocirc:Otilde:Ouml:times:Oslash:Ugrave:Uacute:Ucirc:Uuml:Yacute:THORN:szlig:agrave:aacute:acirc:atilde:auml:aring:aelig:ccedil:egrave:eacute:ecirc:euml:igrave:iacute:icirc:iuml:eth:ntilde:ograve:oacute:ocirc:otilde:ouml:divide:oslash:ugrave:uacute:ucirc:uuml:yacute:thorn:yuml",tkn);for(i=1;i<split("US-ASCII:UTF-8:&nbsp;:\302\240:&mdash;:\342\200\224:&shy;:\302\255:&#xfffd;:\357\277\275:&#x200b;:\342\200\213:</BODY></HTML>:</body></html>",mt);i+=2){nt[gensub(/[\\\&;]/,"","g",mt[i])]=mt[i+(chs % 2)]};delete mt;shyv="\\1\\" nt["shy"] "\\2";if("AZ"!~argoval){if((argoval=="C")||(argoval=="D")){stt="P.d {text-align: justify;}"};if(argoval=="D"){stt="H2.b {color: #f00;} " stt};if(argoval=="L"){stt="P.l {line-height: 87%; margin-bottom: 0cm; margin-left: 0.6cm; text-align: justify; text-indent: -0.6cm;} P.t {font-weight: bold;}"};if(argoval=="S"){stt="P.s {line-height: 87%; margin-bottom: 0.03cm; margin-left: 0.4cm; text-align: justify; text-indent: -0.4cm;}"};if(argoval=="U"){stt="line-height: 150%; margin-left: 1cm; page-break-inside: avoid; text-align: left; text-indent: -1cm;";stt="P.c {line-height: 120%; margin-bottom: 0.7cm; margin-left: 2cm; margin-right: 1cm; text-align: justify; } P.f {" stt "} P.p {font-size: smaller; "stt "} P.r {font-weight: bold; line-height: 150%; margin-top: 1cm; page-break-after: avoid; text-align: left; } P.u {line-height: 150%; margin-bottom: 0.5cm; text-align: justify;}"};stt="<STYLE TYPE=\"text/css\"><!-- " stt " --></STYLE></HEAD><BODY>";if(chs==3){stt=tolower(stt)};if(argoval=="P"){stt="<link rel=\"stylesheet\" type=\"text/css\" media=\"screen, projection, print\" href=\"http://www.w3.org/Talks/Tools/Slidy/slidy.css\" /><script src=\"http://www.w3.org/Talks/Tools/Slidy/slidy.js\" charset=\"utf-8\" type=\"text/javascript\"></script></head><body>"};if(chs==3){printf "%s","<?xml version=\"1.0\" encoding=\"" nt["US-ASCII"] "\"?><\041DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"sv\"><head><title></title>" stt};if(chs!=3){printf "%s","<\041DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\"><HTML><HEAD><META HTTP-EQUIV=\"content-type\" CONTENT=\"text/html; charset=" nt["US-ASCII"] "\"><TITLE></TITLE>" stt}}}{cla="";if(match($0,/^<[hH][1-6]>/)){stt="H" substr($0,3,1);gsub(/^<[hH][1-6]>|<\/[hH][1-6]>$/,"")}else{stt="P";if(argoval=="C"){cla="d"}};gsub(/[\a\b\v\n\r\t ]+/," ");gsub(/^ | $/,"");if((argoval=="U")&&(($0~/^\\/)||($0~/^:; /))){cla="p";flt="cp2htmla.awk";print |& flt;close(flt,"to");flt |& getline;close(flt);sub(/^\\?: *; */,"")};for(i=0;i<33;i++){gsub(sprintf("%c",(i+127)%128),nt["#xfffd"])};if($0=="[extrarad]"){$0=nt["nbsp"]};if(chs!=3){for(i=1;i<129;i++){gsub(sprintf("%c",i+127),"\\&" tkn[i] ";")}};if($0=="[sidbrytning]"){$0="\f"};if(argoval=="D"){if((length($0)<70)&&(substr($0,1,2)!="s.")&&((substr($0,1,11)!~"^\\[sida" nt["nbsp"])||(substr($0,length($0)-2,3)!="ar]"))){stt="H2";if(index($0,nt["nbsp"] nt["mdash"] nt["nbsp"])){cla="b"}};if(NR==1){stt="H1"};if(stt=="P"){cla="d"};};if(argoval=="L"){if(substr($0,1,8)~"^s." nt["nbsp"]){cla="l"}else{cla="t"}};if(argoval=="S"){cla="s"};if((argoval=="U")&&(cla=="")){cla="u";versalrad=$0;gsub(/&#?[[:alnum:]]+;/,"",versalrad);if((versalrad==toupper(versalrad))&&(versalrad~/[[:alpha:]][[:alpha:]]/)){cla="r"};if($0~/^(&[lr][ads]quo;|\302\253|\342\200\234|\342\200\230|\302\273|\342\200\235|\342\200\231)/){cla="c"};if(($0~/^[^ ]+[\(\[]/)||($0~/\014/)){$0=gensub("(^[^ ]+)" nt["nbsp"] "(and|och)" nt["nbsp"],"\\1 \\2 ","1");$0=gensub("^([^([]+\\.,)" nt["nbsp"],"\\1 ","g");cla="f"}else{$0=gensub(/([[:alnum:]]\/)([[:alpha:]])/,"\\1\\" nt["#x200b"] "\\2","g");$0=gensub(/(ations|iblioteks|lldhets)([[:alpha:]])/,shyv,"g");$0=gensub(/([b-df-hj-np-tv-xz]s)(system)/,shyv,"g")}};if("YZ"~argoval){printf "%s",$0}else{if(cla!=""){cla=" CLASS=\"" cla "\""};if(chs==3){cla=tolower(cla);stt=tolower(stt)};printf "%s","<" stt cla ">" $0 "</" stt ">"}} END{if("AZ"!~argoval){printf nt["</BODY></HTML>"]}} |
run-example | grep "something" filename.txt | cp2htmls.awk |
cp2htmlu.awk (or if you like indents and dislike comments maybe cp2htmlu.awk.indents works) | |
purpose | mostly like 'cp2htmlc.awk -U', but cp2htmlu.awk is a special-formatting alternative |
code | BEGIN{age=1;argoval="U";if((ARGV[1]~/^-/)&&(length(ARGV[1])==2)){chs=toupper(substr(ARGV[1],2,1));delete ARGV[1];if("CDLPSUYZ"~chs){argoval=chs}};FS=":";RS="";if(((chs=length(sprintf("%c%c%c%c",0,16,216,128)))==4)&&(argoval=="P")){argoval="C"};split("euro:#xfffd:sbquo:fnof:bdquo:hellip:dagger:Dagger:circ:permil:Scaron:lsaquo:OElig:#xfffd:#x017d:#xfffd:#xfffd:lsquo:rsquo:ldquo:rdquo:bull:ndash:mdash:tilde:trade:scaron:rsaquo:oelig:#xfffd:#x017e:Yuml:nbsp:iexcl:cent:pound:curren:yen:brvbar:sect:uml:copy:ordf:laquo:not:shy:reg:macr:deg:plusmn:sup2:sup3:acute:micro:para:middot:cedil:sup1:ordm:raquo:frac14:frac12:frac34:iquest:Agrave:Aacute:Acirc:Atilde:Auml:Aring:AElig:Ccedil:Egrave:Eacute:Ecirc:Euml:Igrave:Iacute:Icirc:Iuml:ETH:Ntilde:Ograve:Oacute:Ocirc:Otilde:Ouml:times:Oslash:Ugrave:Uacute:Ucirc:Uuml:Yacute:THORN:szlig:agrave:aacute:acirc:atilde:auml:aring:aelig:ccedil:egrave:eacute:ecirc:euml:igrave:iacute:icirc:iuml:eth:ntilde:ograve:oacute:ocirc:otilde:ouml:divide:oslash:ugrave:uacute:ucirc:uuml:yacute:thorn:yuml",tkn);for(i=1;i<split("US-ASCII:UTF-8:&nbsp;:\302\240:&mdash;:\342\200\224:&shy;:\302\255:&#xfffd;:\357\277\275:&#x200b;:\342\200\213:</BODY></HTML>:</body></html>",mt);i+=2){nt[gensub(/[\\\&;]/,"","g",mt[i])]=mt[i+(chs % 2)]};delete mt;shyv="\\1\\" nt["shy"] "\\2";if("AZ"!~argoval){if((argoval=="C")||(argoval=="D")){stt="P.d {text-align: justify;}"};if(argoval=="D"){stt="H2.b {color: #f00;} " stt};if(argoval=="L"){stt="P.l {line-height: 87%; margin-bottom: 0.01cm; margin-left: 0.6cm; text-align: justify; text-indent: -0.6cm;} P.t {font-weight: bold;}"};if(argoval=="S"){stt="P.s {line-height: 87%; margin-bottom: 0.03cm; margin-left: 0.4cm; text-align: justify; text-indent: -0.4cm;}"};if(argoval=="U"){stt="line-height: 150%; margin-left: 1cm; page-break-inside: avoid; text-align: left; text-indent: -1cm;";stt="P.c {line-height: 120%; margin-bottom: 0.7cm; margin-left: 2cm; margin-right: 1cm; text-align: justify; } P.f {" stt "} P.p {font-size: smaller; "stt "} P.r {font-weight: bold; line-height: 150%; margin-top: 1cm; page-break-after: avoid; text-align: left; } P.u {line-height: 150%; margin-bottom: 0.5cm; text-align: justify;}"};if(stt!=""){stt="<STYLE TYPE=\"text/css\"><!-- " stt " --></STYLE>"};stt=stt "<LINK REL=\"stylesheet\" TYPE=\"text/css\" HREF=\"http://www.acc.umu.se/~saasha/gemensam.css\" /></HEAD><BODY>";if(chs==3){stt=tolower(stt)};if(argoval=="P"){stt="<link rel=\"stylesheet\" type=\"text/css\" media=\"screen, projection, print\" href=\"http://www.w3.org/Talks/Tools/Slidy/slidy.css\" /><script src=\"http://www.w3.org/Talks/Tools/Slidy/slidy.js\" charset=\"utf-8\" type=\"text/javascript\"> </script></head><body>"};if(chs==3){if(age==0){printf "%s","<?xml version=\"1.0\" encoding=\"" nt["US-ASCII"] "\"?>"}else{stt="<meta http-equiv=\"Content-type\" content=\"text/html; charset=" nt["US-ASCII"] "\" />" stt};printf "%s","<\041DOCTYPE html PUBLIC \"-//W3C//DTD XHTML+RDFa 1.0//EN\" \"http://www.w3.org/MarkUp/DTD/xhtml-rdfa-1.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\" version=\"XHTML+RDFa 1.0\" xml:lang=\"sv\"><head><title></title>" stt};if(chs!=3){sub(/ \/>/,">",stt);printf "%s","<\041DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\"><HTML><HEAD><META HTTP-EQUIV=\"content-type\" CONTENT=\"text/html; charset=" nt["US-ASCII"] "\"><TITLE></TITLE>" stt}}}{cla="";if(match($0,/^<[hH][1-6]>/)){stt="H" substr($0,3,1);gsub(/^<[hH][1-6]>|<\/[hH][1-6]>$/,"")}else{stt="P";if(argoval=="C"){cla="d"}};gsub(/[\a\b\v\n\r\t ]+/," ");gsub(/^ | $/,"");if((argoval=="U")&&(($0~/^\\/)||($0~/^:; /))){cla="p";flt="cp2htmla.awk";print |& flt;close(flt,"to");flt |& getline;close(flt);sub(/^\\?: *; */,"")};for(i=0;i<33;i++){gsub(sprintf("%c",(i+127)%128),nt["#xfffd"])};if($0=="[extrarad]"){$0=nt["nbsp"]};if(chs!=3){for(i=1;i<129;i++){gsub(sprintf("%c",i+127),"\\&" tkn[i] ";")}};if($0=="[sidbrytning]"){$0="\f"};if(argoval=="D"){if((length($0)<70)&&(substr($0,1,2)!="s.")&&((substr($0,1,11)!~"^\\[sida" nt["nbsp"])||(substr($0,length($0)-2,3)!="ar]"))){stt="H2";if(index($0,nt["nbsp"] nt["mdash"] nt["nbsp"])){cla="b"}};if(NR==1){stt="H1"};if(stt=="P"){cla="d"};};if(argoval=="L"){if(substr($0,1,8)~"^s." nt["nbsp"]){cla="l"}else{cla="t"}};if(argoval=="S"){cla="s"};if((argoval=="U")&&(cla=="")){cla="u";versalrad=$0;gsub(/&#?[[:alnum:]]+;/,"",versalrad);if((versalrad==toupper(versalrad))&&(versalrad~/[[:alpha:]][[:alpha:]]/)){cla="r"};if($0~/^(&[lr][ads]quo;|\302\253|\342\200\234|\342\200\230|\302\273|\342\200\235|\342\200\231)/){cla="c"};if((($0~/^[^ ]+[\(\[]/)||($0~/\014/))&&(cla!="c")){$0=gensub("(^[^ ]+)" nt["nbsp"] "(and|och)" nt["nbsp"],"\\1 \\2 ","1");$0=gensub("^([^([]+\\.,)" nt["nbsp"],"\\1 ","g");cla="f"}else{$0=gensub(/([[:alnum:]]\/)([[:alpha:]])/,"\\1\\" nt["#x200b"] "\\2","g");$0=gensub(/(ations|iblioteks|lldhets)([[:alpha:]])/,shyv,"g");$0=gensub(/(nings)(s[[:alpha:]])/,shyv,"g");$0=gensub(/([b-df-hj-np-tv-xz]s)(system)/,shyv,"g")}};if("YZ"~argoval){printf "%s",$0}else{if(cla!=""){cla=" CLASS=\"" cla "\""};if(chs==3){cla=tolower(cla);stt=tolower(stt)};printf "%s","<" stt cla ">" $0 "</" stt ">"}} END{if("AZ"!~argoval){printf nt["</BODY></HTML>"]}} |
run-example | grep "something" filename.txt | mkutkast.awk | cp2htmlu.awk |
cp2htmly.awk (or if you like indents and dislike comments maybe cp2htmly.awk.indents works) | |
purpose | mostly like 'cp2htmlc.awk -Y', but cp2htmly.awk assumes that the input file already has P, UL, LI and / or Hx tags — no such tag is added |
code | BEGIN{argoval="Y";if((ARGV[1]~/^-/)&&(length(ARGV[1])==2)){chs=toupper(substr(ARGV[1],2,1));delete ARGV[1];if("CDLPSUYZ"~chs){argoval=chs}};FS=":";RS="";if(((chs=length("\000\020\303\200"))==4)&&(argoval=="P")){argoval="C"};split("euro:#xfffd:sbquo:fnof:bdquo:hellip:dagger:Dagger:circ:permil:Scaron:lsaquo:OElig:#xfffd:#x017d:#xfffd:#xfffd:lsquo:rsquo:ldquo:rdquo:bull:ndash:mdash:tilde:trade:scaron:rsaquo:oelig:#xfffd:#x017e:Yuml:nbsp:iexcl:cent:pound:curren:yen:brvbar:sect:uml:copy:ordf:laquo:not:shy:reg:macr:deg:plusmn:sup2:sup3:acute:micro:para:middot:cedil:sup1:ordm:raquo:frac14:frac12:frac34:iquest:Agrave:Aacute:Acirc:Atilde:Auml:Aring:AElig:Ccedil:Egrave:Eacute:Ecirc:Euml:Igrave:Iacute:Icirc:Iuml:ETH:Ntilde:Ograve:Oacute:Ocirc:Otilde:Ouml:times:Oslash:Ugrave:Uacute:Ucirc:Uuml:Yacute:THORN:szlig:agrave:aacute:acirc:atilde:auml:aring:aelig:ccedil:egrave:eacute:ecirc:euml:igrave:iacute:icirc:iuml:eth:ntilde:ograve:oacute:ocirc:otilde:ouml:divide:oslash:ugrave:uacute:ucirc:uuml:yacute:thorn:yuml",tkn);for(i=1;i<split("US-ASCII:UTF-8:&nbsp;:\302\240:&mdash;:\342\200\224:&shy;:\302\255:&#xfffd;:\357\277\275:&#x200b;:\342\200\213:</BODY></HTML>:</body></html>",mt);i+=2){nt[gensub(/[\\\&;]/,"","g",mt[i])]=mt[i+(chs % 2)]};delete mt;shyv="\\1\\" nt["shy"] "\\2";if("AZ"!~argoval){if((argoval=="C")||(argoval=="D")){stt="P.d {text-align: justify;}"};if(argoval=="D"){stt="H2.b {color: #f00;} " stt};if(argoval=="L"){stt="P.l {line-height: 87%; margin-bottom: 0cm; margin-left: 0.6cm; text-align: justify; text-indent: -0.6cm;} P.t {font-weight: bold;}"};if(argoval=="S"){stt="P.s {line-height: 87%; margin-bottom: 0.03cm; margin-left: 0.4cm; text-align: justify; text-indent: -0.4cm;}"};if(argoval=="U"){stt="line-height: 150%; margin-left: 1cm; page-break-inside: avoid; text-align: left; text-indent: -1cm;";stt="P.c {line-height: 120%; margin-bottom: 0.7cm; margin-left: 2cm; margin-right: 1cm; text-align: justify; } P.f {" stt "} P.p {font-size: smaller; "stt "} P.r {font-weight: bold; line-height: 150%; margin-top: 1cm; page-break-after: avoid; text-align: left; } P.u {line-height: 150%; margin-bottom: 0.5cm; text-align: justify;}"};stt="<STYLE TYPE=\"text/css\"><!-- " stt " --></STYLE></HEAD><BODY>";if(chs==3){stt=tolower(stt)};if(argoval=="P"){stt="<link rel=\"stylesheet\" type=\"text/css\" media=\"screen, projection, print\" href=\"http://www.w3.org/Talks/Tools/Slidy/slidy.css\" /><script src=\"http://www.w3.org/Talks/Tools/Slidy/slidy.js\" charset=\"utf-8\" type=\"text/javascript\"></script></head><body>"};if(chs==3){printf "%s","<?xml version=\"1.0\" encoding=\"" nt["US-ASCII"] "\"?><\041DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"sv\"><head><title></title>" stt};if(chs!=3){printf "%s","<\041DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\"><HTML><HEAD><META HTTP-EQUIV=\"content-type\" CONTENT=\"text/html; charset=" nt["US-ASCII"] "\"><TITLE></TITLE>" stt}}}{cla="";if(match($0,/^<[hH][1-6]>/)){stt="H" substr($0,3,1);gsub(/^<[hH][1-6]>|<\/[hH][1-6]>$/,"")}else{stt="P";if(argoval=="C"){cla="d"}};gsub(/[\a\b\v\n\r\t ]+/," ");gsub(/^ | $/,"");if((argoval=="U")&&(($0~/^\\/)||($0~/^:; /))){cla="p";flt="cp2htmla.awk";print |& flt;close(flt,"to");flt |& getline;close(flt);sub(/^\\?: *; */,"")};for(i=0;i<33;i++){gsub(sprintf("%c",(i+127)%128),nt["#xfffd"])};if($0=="[extrarad]"){$0=nt["nbsp"]};if(chs!=3){for(i=1;i<129;i++){gsub(sprintf("%c",i+127),"\\&" tkn[i] ";")}};if($0=="[sidbrytning]"){$0="\f"};if(argoval=="D"){if((length($0)<70)&&(substr($0,1,2)!="s.")&&((substr($0,1,11)!~"^\\[sida" nt["nbsp"])||(substr($0,length($0)-2,3)!="ar]"))){stt="H2";if(index($0,nt["nbsp"] nt["mdash"] nt["nbsp"])){cla="b"}};if(NR==1){stt="H1"};if(stt=="P"){cla="d"};};if(argoval=="L"){if(substr($0,1,8)~"^s." nt["nbsp"]){cla="l"}else{cla="t"}};if(argoval=="S"){cla="s"};if((argoval=="U")&&(cla=="")){cla="u";versalrad=$0;gsub(/&#?[[:alnum:]]+;/,"",versalrad);if((versalrad==toupper(versalrad))&&(versalrad~/[[:alpha:]][[:alpha:]]/)){cla="r"};if($0~/^(&[lr][ads]quo;|\302\253|\342\200\234|\342\200\230|\302\273|\342\200\235|\342\200\231)/){cla="c"};if(($0~/^[^ ]+[\(\[]/)||($0~/\014/)){$0=gensub("(^[^ ]+)" nt["nbsp"] "(and|och)" nt["nbsp"],"\\1 \\2 ","1");$0=gensub("^([^([]+\\.,)" nt["nbsp"],"\\1 ","g");cla="f"}else{$0=gensub(/([[:alnum:]]\/)([[:alpha:]])/,"\\1\\" nt["#x200b"] "\\2","g");$0=gensub(/(ations|iblioteks|lldhets)([[:alpha:]])/,shyv,"g");$0=gensub(/([b-df-hj-np-tv-xz]s)(system)/,shyv,"g")}};if("YZ"~argoval){printf "%s",$0}else{if(cla!=""){cla=" CLASS=\"" cla "\""};if(chs==3){cla=tolower(cla);stt=tolower(stt)};printf "%s","<" stt cla ">" $0 "</" stt ">"}} END{if("AZ"!~argoval){printf nt["</BODY></HTML>"]}} |
run-example | grep "something" filename.txt | cp2htmly.awk |
cp2htmlz.awk (or if you like indents and dislike comments maybe cp2htmlz.awk.indents works) | |
purpose | mostly like 'cp2htmly.awk -Z', but cp2htmlz.awk doesn't add html, head or body tags — no such tag is added. cp2htmlz.awk is quite useless in a UTF-8 environment |
code | BEGIN{argoval="Z";if((ARGV[1]~/^-/)&&(length(ARGV[1])==2)){chs=toupper(substr(ARGV[1],2,1));delete ARGV[1];if("CDLPSUYZ"~chs){argoval=chs}};FS=":";RS="";if(((chs=length("\000\020\303\200"))==4)&&(argoval=="P")){argoval="C"};split("euro:#xfffd:sbquo:fnof:bdquo:hellip:dagger:Dagger:circ:permil:Scaron:lsaquo:OElig:#xfffd:#x017d:#xfffd:#xfffd:lsquo:rsquo:ldquo:rdquo:bull:ndash:mdash:tilde:trade:scaron:rsaquo:oelig:#xfffd:#x017e:Yuml:nbsp:iexcl:cent:pound:curren:yen:brvbar:sect:uml:copy:ordf:laquo:not:shy:reg:macr:deg:plusmn:sup2:sup3:acute:micro:para:middot:cedil:sup1:ordm:raquo:frac14:frac12:frac34:iquest:Agrave:Aacute:Acirc:Atilde:Auml:Aring:AElig:Ccedil:Egrave:Eacute:Ecirc:Euml:Igrave:Iacute:Icirc:Iuml:ETH:Ntilde:Ograve:Oacute:Ocirc:Otilde:Ouml:times:Oslash:Ugrave:Uacute:Ucirc:Uuml:Yacute:THORN:szlig:agrave:aacute:acirc:atilde:auml:aring:aelig:ccedil:egrave:eacute:ecirc:euml:igrave:iacute:icirc:iuml:eth:ntilde:ograve:oacute:ocirc:otilde:ouml:divide:oslash:ugrave:uacute:ucirc:uuml:yacute:thorn:yuml",tkn);for(i=1;i<split("US-ASCII:UTF-8:&nbsp;:\302\240:&mdash;:\342\200\224:&shy;:\302\255:&#xfffd;:\357\277\275:&#x200b;:\342\200\213:</BODY></HTML>:</body></html>",mt);i+=2){nt[gensub(/[\\\&;]/,"","g",mt[i])]=mt[i+(chs % 2)]};delete mt;shyv="\\1\\" nt["shy"] "\\2";if("AZ"!~argoval){if((argoval=="C")||(argoval=="D")){stt="P.d {text-align: justify;}"};if(argoval=="D"){stt="H2.b {color: #f00;} " stt};if(argoval=="L"){stt="P.l {line-height: 87%; margin-bottom: 0cm; margin-left: 0.6cm; text-align: justify; text-indent: -0.6cm;} P.t {font-weight: bold;}"};if(argoval=="S"){stt="P.s {line-height: 87%; margin-bottom: 0.03cm; margin-left: 0.4cm; text-align: justify; text-indent: -0.4cm;}"};if(argoval=="U"){stt="line-height: 150%; margin-left: 1cm; page-break-inside: avoid; text-align: left; text-indent: -1cm;";stt="P.c {line-height: 120%; margin-bottom: 0.7cm; margin-left: 2cm; margin-right: 1cm; text-align: justify; } P.f {" stt "} P.p {font-size: smaller; "stt "} P.r {font-weight: bold; line-height: 150%; margin-top: 1cm; page-break-after: avoid; text-align: left; } P.u {line-height: 150%; margin-bottom: 0.5cm; text-align: justify;}"};stt="<STYLE TYPE=\"text/css\"><!-- " stt " --></STYLE></HEAD><BODY>";if(chs==3){stt=tolower(stt)};if(argoval=="P"){stt="<link rel=\"stylesheet\" type=\"text/css\" media=\"screen, projection, print\" href=\"http://www.w3.org/Talks/Tools/Slidy/slidy.css\" /><script src=\"http://www.w3.org/Talks/Tools/Slidy/slidy.js\" charset=\"utf-8\" type=\"text/javascript\"></script></head><body>"};if(chs==3){printf "%s","<?xml version=\"1.0\" encoding=\"" nt["US-ASCII"] "\"?><\041DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"sv\"><head><title></title>" stt};if(chs!=3){printf "%s","<\041DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\"><HTML><HEAD><META HTTP-EQUIV=\"content-type\" CONTENT=\"text/html; charset=" nt["US-ASCII"] "\"><TITLE></TITLE>" stt}}}{cla="";if(match($0,/^<[hH][1-6]>/)){stt="H" substr($0,3,1);gsub(/^<[hH][1-6]>|<\/[hH][1-6]>$/,"")}else{stt="P";if(argoval=="C"){cla="d"}};gsub(/[\a\b\v\n\r\t ]+/," ");gsub(/^ | $/,"");if((argoval=="U")&&(($0~/^\\/)||($0~/^:; /))){cla="p";flt="cp2htmla.awk";print |& flt;close(flt,"to");flt |& getline;close(flt);sub(/^\\?: *; */,"")};for(i=0;i<33;i++){gsub(sprintf("%c",(i+127)%128),nt["#xfffd"])};if($0=="[extrarad]"){$0=nt["nbsp"]};if(chs!=3){for(i=1;i<129;i++){gsub(sprintf("%c",i+127),"\\&" tkn[i] ";")}};if($0=="[sidbrytning]"){$0="\f"};if(argoval=="D"){if((length($0)<70)&&(substr($0,1,2)!="s.")&&((substr($0,1,11)!~"^\\[sida" nt["nbsp"])||(substr($0,length($0)-2,3)!="ar]"))){stt="H2";if(index($0,nt["nbsp"] nt["mdash"] nt["nbsp"])){cla="b"}};if(NR==1){stt="H1"};if(stt=="P"){cla="d"};};if(argoval=="L"){if(substr($0,1,8)~"^s." nt["nbsp"]){cla="l"}else{cla="t"}};if(argoval=="S"){cla="s"};if((argoval=="U")&&(cla=="")){cla="u";versalrad=$0;gsub(/&#?[[:alnum:]]+;/,"",versalrad);if((versalrad==toupper(versalrad))&&(versalrad~/[[:alpha:]][[:alpha:]]/)){cla="r"};if($0~/^(&[lr][ads]quo;|\302\253|\342\200\234|\342\200\230|\302\273|\342\200\235|\342\200\231)/){cla="c"};if(($0~/^[^ ]+[\(\[]/)||($0~/\014/)){$0=gensub("(^[^ ]+)" nt["nbsp"] "(and|och)" nt["nbsp"],"\\1 \\2 ","1");$0=gensub("^([^([]+\\.,)" nt["nbsp"],"\\1 ","g");cla="f"}else{$0=gensub(/([[:alnum:]]\/)([[:alpha:]])/,"\\1\\" nt["#x200b"] "\\2","g");$0=gensub(/(ations|iblioteks|lldhets)([[:alpha:]])/,shyv,"g");$0=gensub(/([b-df-hj-np-tv-xz]s)(system)/,shyv,"g")}};if("YZ"~argoval){printf "%s",$0}else{if(cla!=""){cla=" CLASS=\"" cla "\""};if(chs==3){cla=tolower(cla);stt=tolower(stt)};printf "%s","<" stt cla ">" $0 "</" stt ">"}} END{if("AZ"!~argoval){printf nt["</BODY></HTML>"]}} |
run-example | grep "something" filename.txt | cp2htmlz.awk |
deci2hex.awk (or if you like indents and dislike comments maybe deci2hex.awk.indents works) | |
purpose | converts decimal numbers prefixed with # to hexadecimal numbers preceded by #x |
code | {while(match($0,/#[0-9]+/)){tal=substr($0,RSTART+1,RLENGTH-1);$0=substr($0,1,RSTART-1) sprintf("#x%x",tal) substr($0,RSTART+RLENGTH)};print} |
run-example | echo "&#65;" | deci2hex.awk |
keep4aht.awk (or if you like indents and dislike comments maybe keep4aht.awk.indents works) | |
purpose | converts the four HTML encoded ASCII characters in an HTML file so that files with these characters may be processed by keepital.awk later |
code | {gsub(/&amp;/,"&amp;");print gensub(/&(gt|lt|quot);/,"\\&amp;\\1;","g")} |
run-example | cat filename.htm | mkparagr.awk | keep4aht.awk | keepital.awk > newfilename.htm |
keepital.awk (or if you like indents and dislike comments maybe keepital.awk.indents works) | |
purpose | adds italic and other marks so that an HTML file may be saved as text without losing italics, etc. Notice that the biggest part of this filter fixes the position of some tags to avoid inconsistent output caused by flawed input data. If you are only interested in keeping italics etc., keep only the last print-statement of the script and the definition of the variable it uses. See also keep4aht.awk. |
code | BEGIN{taggar="b i U strike";split(taggar,tag);taggar="";for(i in tag){tagg[tolower(tag[i])]=ntag=toupper(tag[i]);if(ntag~/^[bB]$/){tagg["strong"]=tagg["STRONG"]=ntag};if(ntag~/^[iI]$/){tagg["em"]=tagg["EM"]=ntag};taggar=taggar "|" ntag};taggar=substr(taggar,2)} {gsub(/[\f\t ]+/," ");for(i in tagg){$0=gensub("(</?)" i ">","\\1" tagg[i] ">","g")};mtag="<SPAN LANG=\"[a-z][a-z]-[A-Z][A-Z]\">| | ";split(taggar,tag,"|");for(i=1;i<=asort(tag);i++){if(i==1){gsub(/( )* +( )*/," ")};if(($0!=($0=gensub("<" tag[i] ">((" mtag ")?)</" tag[i] ">","\\1","g")))||($0!=($0=gensub("</" tag[i] ">((" mtag ")?)<" tag[i] ">","\\1","g")))||($0!=($0=gensub("<(" taggar ")>(" mtag ")","\\2<\\1>","g")))||($0!=($0=gensub("(" mtag ")</(" taggar ")>","</\\2>\\1","g")))||gsub(" </" tag[i] ">","</" tag[i] "> ")||gsub("<" tag[i] "> "," <" tag[i] ">")){i=0}};print gensub("<(/?(" taggar "))\\y","\\\046lt;\\1\\\046gt;&","g",$0)} |
run-example | cat filename.htm | mkparagr.awk | keep4aht.awk | keepital.awk > newfilename.htm |
lineshtm.awk (or if you like indents and dislike comments maybe lineshtm.awk.indents works) | |
purpose | transforms a one-line HTML file to several lines according to the HTML formatting in the file |
code | BEGIN{IGNORECASE=1} {gsub(/</,"\n<");gsub(/\n<\//,"</");sub(/\n</,"<");gsub(/\n<B>/,"<B>");gsub(/\n<EM>/,"<EM>");gsub(/\n<I>/,"<I>");gsub(/\n<STRIKE>/,"<STRIKE>");gsub(/\n<TT>/,"<TT>");gsub(/\n<U>/,"<U>");gsub(/\n<HREF/,"<HREF");print} |
run-example | cat filename.htm | lineshtm.awk > newfilename.htm |
mimeprep.awk (or if you like indents and dislike comments maybe mimeprep.awk.indents works) | |
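purpose | prepares quoted-printable text for mimeqpde.awk: joins soft line breaks (lines ending in =) and wraps the text in =?iso-8859-1?q?...?= encoded-word markers |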
code | BEGIN{inledning="=?iso-8859-1?q?";printf "%s",inledning}{if(!sub(/=$/,"")){$0=$0 "?=\n" inledning};printf "%s",$0}END{print "?="} |
run-example | grep "ngt" filename.txt -A 62 | mimeprep.awk | mimeqpde.awk > newfile.txt |
mimeqpde.awk (or if you like indents and dislike comments maybe mimeqpde.awk.indents works) | |
purpose | replaces quoted-printable code with 8-bit text |
code | BEGIN{IGNORECASE=1; hexasiffra="0123456789abcdef";inledning="=?iso-8859-1?q?";inledlaengd=length(inledning)} {gsub(/=\?windows-1252\?q\?/,inledning);while(boerjan=index($0,inledning)){if(laengd=inledlaengd+index(substr($0,boerjan+inledlaengd),"?=")){snutt=substr($0,boerjan+inledlaengd,laengd-1-inledlaengd);gsub("_"," ",snutt);plats=index(snutt "=","=");while(plats<length(snutt)-1){if(fsiff=index(hexasiffra,substr(snutt,plats+1,1))){asiff=index(hexasiffra,substr(snutt,plats+2,1));if(asiff>0){snutt=substr(snutt,1,plats-1) sprintf("%c",fsiff*16+asiff-17) substr(snutt,plats+3)}};plats+=index(substr(snutt "=",plats+1),"=")}}$0=substr($0,1,boerjan-1) snutt substr($0,laengd+boerjan+1)};print} |
run-example | cat filename.txt | mimeqpde.awk > newfile.txt |
mkampxml.awk (or if you like indents and dislike comments maybe mkampxml.awk.indents works) | |
purpose | Replaces some occurrences of the greater-than, less-than and ampersand signs with their XML entity names. See code for details. |
code | {$0=gensub(/< /,"\\&lt; ","g");$0=gensub(/ & /," \\&amp; ","g");print gensub(/(([^-]-)| )>/,"\\1\\&gt;","g");} |
run-example | echo '_ -> # > # < # & # --> _' | mkampxml.awk |
mkdrafts.awk (or if you like indents and dislike comments maybe mkdrafts.awk.indents works) | |
purpose | filter to use before cp2htmlu.awk to create drafts. Compare with mkutkast.awk |
code | $0=="[extrarad]"||$0=="[sidbrytning]"||/^#pgg# /{next} {print} |
run-example | cat filename.txt | mkdrafts.awk | cp2htmlu.awk |
mkfldtsf.awk (or if you like indents and dislike comments maybe mkfldtsf.awk.indents works) | |
purpose | prepares fld to tso |
code | {steg=300;omlott=100;sub(/^##kro /,"");if((length($0)>21)&&(index($0,"-")==16)){for(i=substr($0,10,6)+omlott;i<substr($0,17,6)-1;i+=steg){if(i % 10000 >5959){i+=4000-steg}else{printf "##kro %s%06i fld \n",substr($0,1,9),i}}}} |
run-example | grep "20070130i131500-150000" filename.txt | mkfldtsf.awk |
mkfldutf.awk (or if you like indents and dislike comments maybe mkfldutf.awk.indents works) | |
purpose | prepares to print fld |
code | BEGIN{kroanf="##kro";ejfrad=0} {if((ejfrad==0)&&($0!~/(inst(\344|\303\244)ll[td])|( uppdelat )|( ejgj )/)&&($1==kroanf)&&(length($2)==22)){ejfrad=1;$1="";$3="";tidigast=substr($2,1,15);senast=substr($2,1,9) substr($2,17);$0=$0;sub(/^ /,"");print "<B>" $0 "</B>"};if((ejfrad==1)&&($1==kroanf)&&(length($2)==15)&&($2>=tidigast)&&($2<=senast)&&($3=="fld")){$1="";$2="";$3="";$0=$0;print}} |
run-example | grep "20070130i131500-150000" filename.txt -A 99 | mkfldutf.awk | xtraline.awk | cp2htmlc.awk |
mkfnotef.awk (or if you like indents and dislike comments maybe mkfnotef.awk.indents works) | |
purpose | moves footnotes from the bottom of a document to their place inside the text body in a document saved as HTML by OpenOffice. If the file has endnotes instead of footnotes, you may try to filter first with (but sometimes, it doesn't help!): awk '{gsub(/endnote/,"footnote");gsub(/<\/?FONT[^>]*>/,"");print}' |
code | BEGIN{fotnot="sann";ejfaerdig="sann";notblockboerjan="<DIV ID=\"sdfootnote"} {if((fotnot!="")&&(substr($0,1,length(notblockboerjan))==notblockboerjan)){while((notnrplats=index($0,"anc\">"))==0){getline};notnrplats=substr($0,notnrplats+5);notnummer=substr(notnrplats,1,index(notnrplats,"<")-1);noten[notnummer]=substr(notnrplats,index(notnrplats ">",">")+1);getline;while($0!="</DIV>"){gsub(/\t/," ");noten[notnummer]=noten[notnummer] substr($0,(noten[notnummer]=="")?2:1);getline};noten[notnummer]=substr(noten[notnummer],1,length(noten[notnummer])-((noten[notnummer]~"</SPAN></P>$")?11:4));gsub("\r"," ",noten[notnummer]);gsub("\002"," ",noten[notnummer]);gsub(/<\/?[pP]( [^>]*)?>/," ",noten[notnummer]);gsub(/ +$/,"",noten[notnummer]);gsub(/^ +/,"",noten[notnummer]);gsub(/^ /,"",noten[notnummer]);gsub(/<BR>/,"",noten[notnummer])}else{if(fotnot){while(substr($0,1,21)!="<!DOCTYPE HTML PUBLIC"){getline}};fotnot="";while(plats=match($0,/(<SUP>)?(<SPAN[^>]*>)?(<FONT[^>]*>)?<A CLASS=\"sdfootnoteanc\" NAME=\"sdfootnote/)) {notnummer=0+substr($0,plats+RLENGTH);match(substr($0 " ",plats) " ",/<\/SUP><\/A>(<\/FONT>)?(<\/SPAN>)?(<\/SUP>)?/);$0=substr($0,1,plats-1) " [not " notnummer ": " noten[notnummer] "]" substr($0 " ",plats+RSTART+RLENGTH-1)};gsub(/ +/," ");if((ejfaerdig)&&(substr($0,1,length(notblockboerjan))==notblockboerjan)){print "</BODY></HTML>";ejfaerdig=""}else{if(ejfaerdig){print}}}} |
run-example | (grep '<DIV ID=\"sdfootnote' documentConvertedToHtmlWithOpenOffice.htm -A 99999; cat documentConvertedToHtmlWithOpenOffice.htm) | mkfnotef.awk > documentWithFootNotesInsideTheText.htm |
mkgpsild.awk (or if you like indents and dislike comments maybe mkgpsild.awk.indents works) | |
purpose | Input is a first line with what should be focused on and remaining lines have coordinates in their sixth and seventh fields. Output is the command to make a map from gps coordinates, which lets you modify the command before it is executed. |
code | BEGIN{marginal=99;skala=1;vinkel=0} {if(NR==1){fokus=$1}else{if(($3=="gps")&&($8!="")){if(koo==""){koo=$4;xmax=xmin=$6;ymax=ymin=$7};if($4==koo){i++;punktx[i]=$6;punkty[i]=$7;punktf[i]=($0~fokus);punktt[i]=gensub(/ .*/,"","1",substr($0,56));if($6>xmax){xmax=$6};if($7>ymax){ymax=$7};if($6<xmin){xmin=$6};if($7<ymin){ymin=$7}}}}} END{printf "convert -size %ix%i xc:transparent ",skala*(xmax-xmin+1+2*marginal),skala*(ymax-ymin+1+2*marginal);for(i in punktx){platsx=skala*(punktx[i]-xmin+marginal);platsy=skala*(ymax-punkty[i]+marginal);printf " -fill " ((punktf[i])?"red":"black") " -draw \047point %i,%i\047",platsx,platsy;if(fokus=="text"){printf " -annotate %ix%i+%i+%i \047%s\047",vinkel,vinkel,platsx,platsy,punktt[i]}};print " -trim +repage gps" koo fokus strftime("%s") ".png"} |
run-example | grep "something" | mkgpsild.awk |
mkkroutf.awk (or if you like indents and dislike comments maybe mkkroutf.awk.indents works) | |
purpose | prepares to print kro |
code | BEGIN{kroanf="##kro"} {if(($1==kroanf)&&($0!~/inst(\344|\303\244)ll[td]/)){sub(/00-/,"-",$2);sub(/00-/,"-",$2);sub(/00$/,"",$2);sub(/00$/,"",$2);sub(/\.\.-/,"-",$2);sub(/\.\.-/,"-",$2);sub(/\.\.-/,"-",$2);sub(/\.\.$/,"",$2);sub(/\.\.$/,"",$2);sub(/\.\.$/,"",$2);sub(/\?\?-/,"-",$2);sub(/\?\?-/,"-",$2);sub(/\?\?-/,"-",$2);sub(/\?\?$/,"",$2);sub(/\?\?$/,"",$2);sub(/\?\?$/,"",$2);if($3=="fld"){$0=substr($0,25)}else{if(($4!="")&&(length($3)==3)){$3=""};sub(/sal:/,"",$4);$0=substr($0,(substr($0,11,1)=="0")?12:11)};gsub(" +"," ");if(length($0)>2){print}}} |
run-example | grep "something" filename.txt | mkkroutf.awk |
mkliacif.awk (or if you like indents and dislike comments maybe mkliacif.awk.indents works) | |
purpose | converts lia to line-quotations |
code | BEGIN{si="-9xvil";aat="[1-2][0-9][0-9][0-9][ab]?";nbs="\240";raq="\273";if(length(sprintf("%c%c%c%c",0,16,216,128))==3){nbs="\302" nbs;raq="\302" raq}}{if($1=="##lia"){if($3==",,,"){gsub(/ (((\227|\342\200\224) )|\(|\[).*/,"");foer=$NF nbs};if(($4==",,,")&&(foer!="")){et=($0~/\(tillsammans med /)?"et" nbs "al." nbs:"";aar=gensub(".* \\[(" aat ")\\].*","\\1","1");if(aar==$0){aar=gensub(".* \\((SOU" nbs ")?(" aat ")(:[1-9][0-9]*)?\\).*","\\2","1")};if((aar==$0)&&(foer=="Prop." nbs)){aar=gensub(".*( |" nbs ")(" aat "/(" aat "|[0-9][0-9]):[1-9][0-9]*) .*","\\2","1")};aar=et ((aar==$0)?":":aar ", ")}};if($4=="0"){$4=$5="";$6=$6 " 0";$0=$0};if($0!~/,,,/){sid=gensub("0*([1" si "][0" si "]*-?)(0*([1" si "][0" si "]*))?","\\1\\3","g",$4);if(sid==9999){sid="utan nummer"};$1=$2=$3=$4=$5="";sub(/^ +/,"");print raq $0 raq " (" foer aar "s." nbs sid ")"}} |
run-example | leta 'butju' lia ngt | mkliacif.awk | xtraline.awk > outfile.txt |
mkliatso.awk (or if you like indents and dislike comments maybe mkliatso.awk.indents works) | |
purpose | builds a lia template from stdin |
code | BEGIN{anf="##lia ";sta=3} function finx(r){ler=((r==9999)||(r==0))?2:1;for(j=1;j<10^ler;j++){printf "%s%s %s %0" len "i %0" ler "i \n",anf,$1,$2,r,j}}{len=length($3);for(i=4;i<=NF;i++){if((length($i)>len)&&($i!=9999)){len=length($i)}};if((NF>2)&&($1~/^[[:alnum:]][[:alnum:]][[:alnum:]][[:alnum:]][[:alnum:]]$/)&&($2~/^[[:alnum:]]+/)&&(($3=="0")||($3!=0))){print anf $1 " ,,, \n" anf $1 " " $2 " ,,, ";if((($4>0)&&($3<=$4))){sta=5;for(i=$3;i<=$4;i++){if(i!=$3){printf "%s%s %s %0" len "i-%0" len "i 1 \n",anf,$1,$2,i-1,i};finx(i)}};for(i=sta;i<=NF;i++){finx($i)};print "ksm ; checklia " $1 " " $2 " ; sparatid tsorall.awk"}} |
run-example | echo "foerf tacc 52 55 150 102 9999 0" | mkliatso.awk | env LC_COLLATE=POSIX sort | uniq |
mkliautf.awk (or if you like indents and dislike comments maybe mkliautf.awk.indents works) | |
purpose | prepares to print lia |
code | BEGIN{liaanf="##lia";nbs="\240";if(length(sprintf("%c%c%c%c",0,16,216,128))==3){nbs="\302" nbs}} {if($1==liaanf){gsub(/\r/,"");if(($4~"^[[:digit:]]+-?[[:digit:]]")&&($3!=",,,")){$5="";while($4~/-0/){gsub(/-0/,"-",$4)}};if($3==",,,"){et=($0~/\(tillsammans med /)?"et" nbs "al.:":":";$0=substr($0,17);foerf=gensub(/( (((\227|\342\200\224) )|\(|\[).*)?$/,":","1");sub(/\.:/,".",foerf);$0=""};if($4==",,,"){$1="";$2="";$3="";$4=""};if($1==liaanf){$1=(($2=="intme")?"r":"s") "." nbs;$2=$3=$5=""};$0=gensub("^([rsv]\\." nbs ")[ 0]+","\\1","1",$0);gsub(/ +/," ");radboerjan=(substr($0,1,1)==" ")?((NR>2)?"\n\n":"") foerf:"\n";$0=gensub("^([rsv]\\." nbs ")9999 ","\\1? ","1",$0);if($0~"^[rsv]\\." nbs){sub(/ /,nbs)};if(($2!="")&&(($2!="[ej")||($3!="antecknat]"))){printf("%s%s",radboerjan,$0)}}} |
run-example | grep "something" filename.txt | mkliautf.awk |
mkparagr.awk (or if you like indents and dislike comments maybe mkparagr.awk.indents works) | |
purpose | suppresses line-breaks except for empty lines in a text file. Compare with xtraline.awk |
code | BEGIN{RS="";FS="\n"}{gsub(/\n/," ");printf "%s%s",(NR>1)?"\n":"",$0} |
run-example | cat filename.txt | mkparagr.awk > newfilename.txt |
mkprglis.awk (or if you like indents and dislike comments maybe mkprglis.awk.indents works) | |
purpose | Converts perl, C, java or python code to dbk:article to be easily included in docbook documents with xi:include |
code | BEGIN{prgm=1;pl="perl";comm="#[^\0413].*";calo=" *#([-a-zA-Z]+)[^<]*"}{sub(/[\t\r\f ]+$/,"");if(NR==1){if($0~/^(\/\*|\043include )/){pl="C";comm="^(/\\*.*| \\*.*)$|//.*";calo="TODO not yet defined"}else{if($0~/^(\/\/|import )/){pl="java";comm="^(/\\*.*| \\*.*)$|//.*";calo="TODO not yet defined"}else{if($0~/python/){pl="python";calo = "TODO not yet defined"}}};printf "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<dbk:article xmlns:dbk=\"http://docbook.org/ns/docbook\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://docbook.org/ns/docbook docbook5_docbook.xsd\"><dbk:programlisting language=\"" pl "\" xml:id=\"" pl length($0) "\" xml:space=\"preserve\">"};if(prgm){if($0~/^__(END|DATA)__$/){prgm=0}else{gsub(/&/,"\\&amp;");gsub(/</,"\\&lt;");gsub(/>/,"\\&gt;");gsub(/\"/,"\\&quot;");gsub(/\047/,"\\&\04339;");$0=gensub(comm,"<dbk:lineannotation>&</dbk:lineannotation>","1");$0=gensub(calo,"& <dbk:co xml:id=\"\\1-co\" linkends=\"\\1\" />","1");printf "%s%3i %s",((NR>1)?"\n":""),NR,$0}}}END{print "</dbk:programlisting></dbk:article>"} |
run-example | mkprglis.awk myProg.pl > myProg_pl.xml |
mkrauhtm.awk (or if you like indents and dislike comments maybe mkrauhtm.awk.indents works) | |
purpose | builds htm-files for rara |
code | BEGIN{svp="/home/s/saasha/public_html/htmsnutt/";RS="";bredd=400;alte="rarasida";htmsnutt=svp "htmlanfa.txt";getline webbfilanf < htmsnutt;close (htmsnutt);htmsnutt=svp "htmltype.txt";getline webbfil < htmsnutt;close (htmsnutt);webbfilanf=webbfilanf webbfil "<TITLE>En alternativ RARA-sida</TITLE></HEAD><BODY><H1 ALIGN=\"center\">En alternativ RARA-sida</H1><TABLE ALIGN=\"center\" BORDER=\"0\" CELLPADDING=\"0\" CELLSPACING=\"0\"><TR><TD COLSPAN=\"2\" ALIGN=\"center\">Boken visas uppslagsvis</TD></TR><TR><TD ALIGN=\"right\">";htmsnutt=svp "htmlfoot.txt";getline webbfilsl < htmsnutt;close (htmsnutt);webbfilsl="</TD></TR></TABLE>" webbfilsl "</BODY></HTML>"}{for(sn=1;sn<NF+1;sn+=2){if(sn!=1){si[sn]="<A HREF=\"" gensub(/\.(png|jpg)/,".htm","1",$(sn-((sn==2)?1:2))) "\">föregående uppslag</A> —"};si[sn]=si[sn] "</TD><TD ALIGN=\"left\">";if((sn==1)&&(NF>1)){si[sn]=si[sn] "— <A HREF=\"" gensub(/\.(png|jpg)/,".htm","1",$2) "\">nästa uppslag</A>"}else{if($(sn+2)!=""){si[sn]=si[sn] "— <A HREF=\"" gensub(/\.(png|jpg)/,".htm","1",$(sn+2)) "\">nästa uppslag</A>"}};si[sn]=si[sn] "</TD></TR><TR><TD WIDTH=\"" bredd "\">";if(sn>1){si[sn]=si[sn] "<IMG ALIGN=\"right\" SRC=\"" $sn "\" ALT=\"" alte "\">"};si[sn]=si[sn] "</TD><TD WIDTH=\"" bredd "\">";if(sn==1){si[sn]=si[sn] "<IMG ALIGN=\"left\" SRC=\"" $1 "\" ALT=\"" alte "\">"}else{if(sn<NF){si[sn]=si[sn] "<IMG ALIGN=\"left\" SRC=\"" $(sn+1) "\" ALT=\"" alte "\">"}};hf=gensub(/\.(png|jpg)/,".htm","1",$sn);filut=webbfilanf si[sn] webbfilsl;print "echo \"" gensub(/\"/,"\\\\\"","g",filut) "\" > " hf;if(sn==1){print "\\ln -s " hf " index.htm"};if(sn==1){sn=0}}} |
run-example | ls *.jpg | mkrauhtm.awk | sh |
mkravhtm.awk (or if you like indents and dislike comments maybe mkravhtm.awk.indents works) | |
purpose | an improvement of mkrauhtm.awk |
code | BEGIN{svp="/home/s/saasha/public_html/htmsnutt/";RS="";bredd=400;alte="rarasida";lsv="/~saasha/cgi-bin/acc_download.cgi/rara/";htmsnutt=svp "htmlanfa.txt";getline webbfilanf < htmsnutt;close (htmsnutt);htmsnutt=svp "htmltype.txt";getline webbfil < htmsnutt;close (htmsnutt);webbfilanf=webbfilanf webbfil "<TITLE>En alternativ RARA-sida</TITLE></HEAD><BODY><H1 ALIGN=\"CENTER\">En alternativ RARA-sida</H1><TABLE ALIGN=\"CENTER\" BORDER=\"0\" CELLPADDING=\"0\" CELLSPACING=\"0\"><TR><TD COLSPAN=\"2\" ALIGN=\"CENTER\">Boken visas uppslagsvis</TD></TR><TR><TD ALIGN=\"RIGHT\">";htmsnutt=svp "htmlfoot.txt";getline webbfilsl < htmsnutt;close (htmsnutt);webbfilsl="</TD></TR></TABLE>" webbfilsl "</BODY></HTML>"} function finx(nro){gensub(/.htm/,nro ".htm","1",hf)} function umf(filutt,finr){print "echo \"" gensub(/\"/,"\\\\\"","g",filutt) "\" > " gensub(/.htm/,finr ".htm","1",hf)} {for(sn=1;sn<NF+1;sn+=2){if(sn!=1){si[sn]="<A HREF=\"" lsv gensub(/\.(png|jpg)/,".htm","1",$(sn-((sn==2)?1:2))) "\">föregående uppslag</A> —"};si[sn]=si[sn] "</TD><TD ALIGN=\"LEFT\">";if((sn==1)&&(NF>1)){si[sn]=si[sn] "— <A HREF=\"" lsv gensub(/\.(png|jpg)/,".htm","1",$2) "\">nästa uppslag</A>"}else{if($(sn+2)!=""){si[sn]=si[sn] "— <A HREF=\"" lsv gensub(/\.(png|jpg)/,".htm","1",$(sn+2)) "\">nästa uppslag</A>"}};hf=gensub(/\.(png|jpg)/,".htm","1",$sn);haeh="identify "$sn;haeh | getline hoejd;close(haeh);hoejd=substr(hoejd,index(hoejd," ")+1);hoejd=substr(hoejd,index(hoejd,"x")+1);hoejd=substr(hoejd,1,index(hoejd," ")-1);si[sn]=si[sn] "</TD></TR><TR HEIGHT=\"" hoejd "\"><TD WIDTH=\"" bredd "\">";if(sn>1){si[sn]=si[sn] "<IMG ALIGN=\"RIGHT\" SRC=\"" $sn "\" ALT=\"" alte "\">"};si[sn]=si[sn] "</TD><TD WIDTH=\"" bredd "\">";if(sn==1){si[sn]=si[sn] "<IMG ALIGN=\"LEFT\" SRC=\"" $1 "\" ALT=\"" alte "\">"}else{if(sn<NF){si[sn]=si[sn] "<IMG ALIGN=\"LEFT\" SRC=\"" $(sn+1) "\" ALT=\"" alte "\">"}};filut=webbfilanf si[sn] ((sn==NF)?"":("</TD></TR><TR><TD COLSPAN=\"2\"><P ALIGN=\"CENTER\"><A HREF=\"" lsv 
gensub(/.htm/,"h9.htm","1",hf) "\">Vänd sidan för att bläddra i boken</A></P>")) webbfilsl;umf(filut,"");if(sn==1){print "\\ln -s " hf " index.htm"};if(sn!=111){bkg=((sn>1)?$(sn-2):"");sub(/TD WIDTH=/,"TD BACKGROUND=\"" bkg "\" WIDTH=",filut);sub(/TD WIDTH=/,"TD BACKGROUND=\"" $(sn+((sn==1)?2:3)) "\" WIDTH=",filut);sub("TD BACKGROUND=\042" $0 "\042 WIDTH=","TD WIDTH=",filut);sub(/TD BACKGROUND=\"\" WIDTH=/,"TD WIDTH=",filut);gsub(/HEAD><META/,"HEAD><META http-equiv=\"refresh\" content=\"1; url=\"><META",filut);steg=3;for(vink=9;vink>-1;vink-=steg){if(sn!=44){if(vink>0){umf(gensub("IMG ALIGN=\042LEFT","IMG WIDTH=\"" vink "0%\" HEIGHT=\"" hoejd "\" ALIGN=\042LEFT","1",gensub(/1; url=/,"1; url=" gensub(/.htm/,"h" (vink-3) ".htm","1",hf) finx("h" vink-3),"1",filut)),"h" vink)}else{umf(gensub("IMG ALIGN=\042LEFT","IMG WIDTH=\"" vink "0%\" HEIGHT=\"" hoejd "\" ALIGN=\042LEFT","1",gensub(/1; url=/,"1; url=" gensub(/.htm/,"v" steg ".htm","1",gensub(/\.(png|jpg)/,".htm","1",$(sn+((sn==1)?1:2)) )),"1",filut)),"h" vink)}}};for(vink=9;vink>0;vink-=steg){if(sn!=66){if(vink<9){umf(gensub("IMG ALIGN=\042RIGHT","IMG WIDTH=\"" vink "0%\" HEIGHT=\"" hoejd "\" ALIGN=\042RIGHT","1",gensub(/1; url=/,"1; url=" gensub(/.htm/,"v" (vink+3) ".htm","1",hf) finx("v" vink+3),"1",filut)),"v" vink)}else{umf(gensub("IMG ALIGN=\042RIGHT","IMG WIDTH=\"" vink "0%\" HEIGHT=\"" hoejd "\" ALIGN=\042RIGHT","1",gensub(/1; url=/,"1; url=" gensub(/\.(png|jpg)/,".htm","1",$sn),"1",filut)),"v" vink)}}};};if(sn==1){sn=0}}} |
run-example | ls *.png | mkravhtm.awk | sh |
mkutkast.awk (or if you like indents and dislike comments maybe mkutkast.awk.indents works) | |
purpose | filter to use before cp2htmlu.awk to create anti-drafts. Compare with mkdrafts.awk |
code | ($0=="[extrarad]")||($0=="[sidbrytning]"){next} {print} |
run-example | grep "^#pgg# " -C 3 filename.txt | mkutkast.awk | cp2htmlu.awk |
ppmshort.awk (or if you like indents and dislike comments maybe ppmshort.awk.indents works) | |
purpose | shortens .ppm-files to one line without comments |
code | !/^#/{pix=pix $0 " "} END{gsub(/ +/," ",pix);gsub(/^ | $/,"",pix);print pix} |
run-example | convert picturename -compress none ppm:- | ppmshort.awk |
renpajek.awk (or if you like indents and dislike comments maybe renpajek.awk.indents works) | |
purpose | cleans pajek SVG |
code | {gsub(/\r/,"");gsub(/<g transform="scale\(1\)">/,"<g>");if(NR==1){$0=$0 "<\041DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\" \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\" >"};if((NR>1)&&(NR<5)&&($1=="<\041--")){$0=""};if(NR==5){$0=gensub(/^(<svg )xml:space="preserve(" width="([0-9]+)" height="([0-9]+)" )(xmlns="http:\/\/www.w3.org\/2000\/svg" )(xmlns:xlink="http:\/\/www.w3.org\/1999\/xlink">)$/,"\\1\\5viewBox=\"0 0 \\3 \\4\" \\6<rect x=\"1\" y=\"1\\2fill=\"white\"/>","1")};if((NR>6)&&(NR<54)){$0=""};if($0~/^<ellipse cx="/){sub(/fill:rgb\(255,255,255/,"fill:rgb(255,0,0")};if($0!=""){print}} |
rensaocr.awk (or if you like indents and dislike comments maybe rensaocr.awk.indents works) | |
purpose | cleans OCR output |
code | function laddatab(par,tab,  bit,n,k){n=split(par,bit,"~")/2;for(k=1;k<=n;k++){tab[bit[2*k-1]]=bit[2*k]}} BEGIN{laddatab("otke~othe~tke~the",tkna);laddatab("fFs~ffs",tknm);laddatab("contray~contrary~galley~gallery~geomety~geometry~libray~library~Libray~Library~Mew~New~narure~nature~novely~novelty~nrom~from~ofa~of a~theoy~theory~situng~sitting~",tkno);laddatab("ajve~ative~aliy~ality~cUon~ction~guiy~guity~iliy~ility~iviy~ivity~nuiy~nuity",tkns);utrad=""} {for(w in tkna){$0=gensub("\\y" w,tkna[w],"g")};for(w in tknm){$0=gensub(w,tknm[w],"g")};for(w in tkno){$0=gensub("\\y" w "\\y",tkno[w],"g")};for(w in tkns){$0=gensub(w "\\y",tkns[w],"g")};utrad=utrad $0 "\n"} END{printf "%s", utrad} |
run-example | gocr -f ISO8859_1 -a 80 -i filename.png | rensaocr.awk |
rensasvg.awk (or if you like indents and dislike comments maybe rensasvg.awk.indents works) | |
purpose | drastically cleans SVG |
code | {if(NR==1){print};if(f){gsub(/^.*--.*$|scale\(1 1\) rotate\(0\) | (id|class)="[^"]+"|<title>[^<]+<\/title>|\r/,"");printf "%s",gensub(/\.[[:digit:]]+([" ,A-Za-z()])/,"\\1","g",$0)};if($0~/^<svg /){sub(/ width="[^"]*" height="[^"]*"/,"");printf "%s",$0;f=1}} |
rensaxon.awk (or if you like indents and dislike comments maybe rensaxon.awk.indents works) | |
purpose | cleans input xhtml-files to be able to be transformed with saxon using xsl-transformations (xslt) |
code | {gsub(/\r|<\041DOCTYPE [^>]*>| xmlns="http:\/\/www.w3.org\/1999\/xhtml"/,"")} $0!="" |
scal2sur.awk (or if you like indents and dislike comments maybe scal2sur.awk.indents works) | |
purpose | Replaces hexadecimal scalars in XML-encoded code points (over U+FFFF) with decimal XML-encoded surrogate pairs, which is useful for building input to UTF-16 based software |
code | {while(match($0,/&#[xX]0?[1-9A-Fa-f]0?[[:xdigit:]][[:xdigit:]][[:xdigit:]][[:xdigit:]];/)){sca=strtonum("0" substr($0,RSTART+2,RLENGTH-3));$0=substr($0,1,RSTART+1) (int(sca/1024)+55232) ";&#" (56320+(sca%1024)) ";" substr($0,RSTART+RLENGTH)};print} |
run-example | echo "m_nn_m" | scal2sur.awk |
su2swlu8.awk (or if you like indents and dislike comments maybe su2swlu8.awk.indents works) | |
purpose | Converts a field-separated file containing fields with Swedish sign language (iso_639-3 language code swl) text saved using the old (ASCII-based) SU fonts to a plain field-less text file. The default field separator is the at-sign (@), but this can easily be changed at the beginning of the filter (variable fsep). In each input line, the fields are assumed to be grouped by three. The first field of each field group is assumed to be non-swl text and is output as is (except for sequences of three field separator characters, which are output as one plain occurrence of this character), the second field is assumed to be SU-encoded with SU's first font and the third field with SU's second font. In a UTF-8 environment, the input is assumed to be UTF-8 and the output is plain UTF-8, using the PUA for swl text. In an 8-bit environment, the input is assumed to be ISO_8859-1 or CP-1252 encoded and the output has the same encoding as the input, where swl characters are encoded as XML character references in the PUA. A sans-serif font called swl_sans.ttf is a PUA font for swl available at http://www.acc.umu.se/~saasha/typsnitt/#swl_sans |
code | BEGIN{fsep="@";if((chs=length(sprintf("%c%c%c%c",0,16,216,128)))==3){c[1,sprintf("%c%c",194,184)]=c[2,sprintf("%c%c",195,165)]=sprintf("%c%c%c%c",244,140,165,159);c[1,sprintf("%c%c",194,163)]=sprintf("%c%c%c%c",244,140,165,158);c[2,sprintf("%c%c",194,163)]=sprintf("%c%c%c%c",244,140,166,145);}else{c[1,sprintf("%c",184)]=c[2,sprintf("%c",229)]="&#x10c95f;";c[1,sprintf("%c",163)]="&#x10c95e;";c[2,sprintf("%c",163)]="&#x10c991;"};b=split(sprintf ("%c",33) "@10c900@10c960@&@10c905@10c964@#@10c902@10c961@$@10c903@10c962@%@10c904@10c963@)@10c906@10c91f@*@10c907@10c965@+@10c908@10c966@,@10c909@10c967@-@10c90a@10c968@.@10c90b@10c969@/@10c90c@10c96a@0@10c911@10c96b@1@10c912@10c96c@2@10c913@10c96d@3@10c914@10c96e@4@10c915@10c96f@5@10c916@10c970@6@10c917@10c971@7@10c918@10c972@8@10c919@10c973@9@10c91a@10c974@:@10c91b@10c975@;@10c91c@10c976@=@10c91f@@?@10c921@10c979@A@10c922@10c97a@B@10c923@10c97b@C@10c924@10c97c@D@10c925@10c97d@E@10c926@10c97e@F@10c927@10c97f@G@10c928@10c980@H@10c929@10c982@I@10c92a@10c942@J@10c92b@10c923@K@10c92c@10c924@L@10c92d@10c946@>@10c920@10c978@M@10c92e@10c947@N@10c92f@10c927@O@10c930@10c949@P@10c931@10c94a@Q@10c932@10c94b@R@10c933@10c92b@S@10c934@10c94c@T@10c935@10c92e@U@10c936@10c93d@V@10c937@10c92f@W@10c938@10c931@X@10c939@10c952@Y@10c93a@10c95f@<@10c91e@10c977@Z@10c93b@10c933@[@10c93c@10c983@]@10c93e@10c984@_@10c940@10c985@a@10c942@10c986@b@10c943@10c987@c@10c944@10c988@d@10c945@10c989@e@10c946@10c98a@f@10c947@10c98b@g@10c948@10c98c@h@10c949@10c922@i@10c94a@10c943@j@10c94b@10c944@k@10c94c@10c945@l@10c90e@10c925@m@10c94e@10c926@n@10c93d@10c948@o@10c93f@10c928@p@10c951@10c929@q@10c952@10c92a@r@10c953@10c92c@s@10c954@10c90e@t@10c955@10c92d@u@10c956@10c94e@v@10c957@10c93f@w@10c958@10c930@x@10c959@10c951@y@10c95a@10c932@z@10c95b@@~@10c95c@10c98e",a,"@");for(i=1;i<b;i+=3){for(j=1;j<3;j++){c[j,a[i]]=((chs==3)?(sprintf("%c%c%c%c",244,140,128+int((strtonum("0x" a[i+j]) % 4096)/64),(strtonum("0x" a[i+j]) % 64)+128)):("&#x" a[i+j] 
";"))}}}{gsub(/\r/,"");for(i=1;i<=length($0);i++){m=substr($0,i,1);if(m==fsep){j++;j%=3;k++;if((k%=3)==0){n=n fsep}}else{k=0;n=n ((c[j,m]=="")?(m):(c[j,m]))}}}END{print n} |
run-example | echo "bla@b¸£@l@@@Yå£@bla@@@bla" | su2swlu8.awk |
text2bra.awk (or if you like indents and dislike comments maybe text2bra.awk.indents works) | |
purpose | a text2braille filter converting a text file to a UTF-8 encoded braille unicode character file according to the swedish braille standard. If the environment is not UTF-8, then the input file is assumed to be a CP-1252 file and therefore, it works with ISO_8859-1 files too. |
code | BEGIN{ant=split("\n~\n~ ~\342\240\200~!~\342\240\226~%~\342\240\217\342\240\211~(~\342\240\246~)~\342\240\264~*~\342\240\224~,~\342\240\202~-~\342\240\244~.~\342\240\204~/~\342\240\214~0~\342\240\274~1~\342\240\241~2~\342\240\243~3~\342\240\251~4~\342\240\271~5~\342\240\261~6~\342\240\253~7~\342\240\273~8~\342\240\263~9~\342\240\252~:~\342\240\222~;~\342\240\206~?~\342\240\242~A~\342\241\201~B~\342\241\203~C~\342\241\211~D~\342\241\231~E~\342\241\221~F~\342\241\213~G~\342\241\233~H~\342\241\223~I~\342\241\212~J~\342\241\232~K~\342\241\205~L~\342\241\207~M~\342\241\215~N~\342\241\235~O~\342\241\225~P~\342\241\217~Q~\342\241\237~R~\342\241\227~S~\342\241\216~T~\342\241\236~U~\342\241\245~V~\342\241\247~W~\342\241\272~X~\342\241\255~Y~\342\241\275~Z~\342\241\265~a~\342\240\201~b~\342\240\203~c~\342\240\211~d~\342\240\231~e~\342\240\221~f~\342\240\213~g~\342\240\233~h~\342\240\223~i~\342\240\212~j~\342\240\232~k~\342\240\250~l~\342\240\207~m~\342\240\215~n~\342\240\235~o~\342\240\225~p~\342\240\217~q~\342\240\237~r~\342\240\227~s~\342\240\216~t~\342\240\236~u~\342\240\245~v~\342\240\247~w~\342\240\272~x~\342\240\255~y~\342\240\275~z~\342\240\265~\265~\342\240\215\342\240\211~\304~\342\241\234~\305~\342\241\241~\310~\342\241\256~\311~\342\241\277~\326~\342\241\252~\334~\342\241\263~\340~\342\240\267~\344~\342\240\234~\345~\342\240\241~\350~\342\240\256~\351~\342\240\277~\366~\342\240\252~\374~\342\240\263~\211~\342\240\217\342\240\215~\302\265~\342\240\215\342\240\211~\303\204~\342\241\234~\303\205~\342\241\241~\303\210~\342\241\256~\303\211~\342\241\277~\303\226~\342\241\252~\303\234~\342\241\263~\303\240~\342\240\267~\303\244~\342\240\234~\303\245~\342\240\241~\303\250~\342\240\256~\303\251~\342\240\277~\303\266~\342\240\252~\303\274~\342\240\263~\342\200\260~\342\240\217\342\240\215",tkn2,"~")/2;for(i=1;i<=ant;i++){tkn[tkn2[2*i-1]]=tkn2[2*i]};utrad=""}{for(i=1;i<=length($0);i++){utrad=utrad tkn[substr($0,i,1)]}}END{printf "%s", utrad} |
run-example | echo " !()*,-.0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" | text2bra.awk |
winwidth.awk (or if you like indents and dislike comments maybe winwidth.awk.indents works) | |
purpose | cuts lines longer than the window's width |
code | BEGIN{haemtabredd="echo $COLUMNS";haemtabredd | getline bredden;close(haemtabredd);if(bredden+0<1){haemtabredd="tput cols";haemtabredd | getline bredden;close(haemtabredd)};if(bredden+0<1){bredden=80}} {gsub(/\205/,".");print substr($0,1,bredden)} |
run-example | cat filename.txt | winwidth.awk |
ws2pajek.awk (or if you like indents and dislike comments maybe ws2pajek.awk.indents works) | |
purpose | filter based on journals and cited authors which builds pajek files from WoS files |
code | BEGIN{r="\\r\\n"}{gsub(/\r/,"");if($1=="CR"){$1=" ";cr=1};if((cr==1)&&($0~/^ /)){f=gensub(/^ *([^,]+),.*/,"\\1","1",$0);cf[f]++;li=li f ",";li}else{cr=0};if($1=="J9"){h=substr($0,4);ci[h]=ci[h] li;li="";t[h]++}}END{for(i in cf){j++;v=v j " \"" i "\"" r;fc[i]=j;cl=cl "1" r;e=e cf[i] r};for(i in ci){j++;e=e t[i] r;v=v j " \"" i "\"" r;sub(/,$/,"",ci[i]);for(k=split(ci[i],il,",");k>0;k--){a[j " " fc[il[k]]]++};cl=cl "2" r};for(i in a){ar=ar i " " a[i] r};j="echo -n \047*Vertices " j r;k="\047 > fileToPajek.";print j v "*Arcs" r ar k "net";print j cl k "clu";print j e k "vec"} |
run-example | cat filename.txt | ws2pajek.awk | sh |
xmlliner.awk (or if you like indents and dislike comments maybe xmlliner.awk.indents works) | |
purpose | removes line breaks and indentations in an XML file. Strings are assumed to be of type xs:token. See also xmlsvans.awk |
code | {gsub(/[\f\r\t ]+/," ");sub(/ $/,"");sub(/^ </,"<");printf "%s",$0} |
run-example | cat filename.xml | xmlliner.awk > newfilename.xml |
xmlsvans.awk (or if you like indents and dislike comments maybe xmlsvans.awk.indents works) | |
purpose | Reminiscent of the Lisp indent style, removes some useless line breaks and indentations in a traditionally indented XML file, thus (slightly) reducing the number of lines and the size of the file while keeping the file easy to read for humans. Also reduces multiple occurrences of the asterisk character within comments. Note that xmlsvans.awk assumes that every xs:string is an xs:token according to the (not so intuitive) definition w3c has for the concept of xs:token. See also xmlliner.awk |
code | {gsub(/\r/,"");gsub(/ +$/,"");$0=gensub(/^[\t ]+(((<[\/\041]|[^ <]).*)[^ ]|)[\t ]*$/,"\\1","g");$0=gensub(/([^ \t])[\t ]+/,"\\1 ","g");do{a=$0;$0=gensub(/(<\041-- (\*?[^*>])*)(\*)\*+/,"\\1*","g",gensub(/(\*+)(\*(-?[^-])* -->)/,"\\2","g",a))}while($0!=a);printf "%s%s",(($0~/^(<\/.*|)$/||NR==1)?"":(($0~/^[^ <\]]/)?fdr:("\n"))),$0;fdr=(($0!~/^[^>]*>$/)?" ":"")} |
run-example | env XMLLINT_INDENT=" "" "" "" " xmllint --format --nsclean --postvalid filename.xml | xmlsvans.awk > newfilename.xml |
xtraline.awk (or if you like indents and dislike comments maybe xtraline.awk.indents works) | |
purpose | adds an empty line when lacking between each line in a text file. Compare with mkparagr.awk |
code | {sub(/^[ \t\r]+$/,"")} $0!=""{printf "%s%s",((skrivet++)?("\n" ORS):""),$0} |
run-example | cat filename.txt | xtraline.awk > newfilename.txt |