KidzSearch Encyclopedia:AutoEd/unicodify.js
//<source lang=javascript>
/**
 * Replace HTML character references and stray Windows-1252 control characters
 * in a piece of wikitext with the literal Unicode characters they stand for.
 *
 * Three tasks run in this order:
 *   1. named entities (&mdash;, &Sigma;, &rarr;, ...) plus ASCII-art arrows;
 *   2. numeric entities (&#8212;, &#x2014;), except for characters listed in
 *      dont_replace, which are left escaped;
 *   3. C1 control characters U+0080-U+009F, reinterpreted as Windows-1252.
 *
 * &lt;, &gt; and &amp; are intentionally left alone (see "False positives").
 *
 * @param {string} str Text to clean up.
 * @returns {string} The text with the replacements applied.
 */
function autoEdUnicodify(str) { //MAIN FUNCTION describes list of fixes

    var hasOwn = Object.prototype.hasOwnProperty;

    // Task 1: Replace named html entities with unicode

    // Entity names that are matched regardless of letter case.
    var anyCaseEntities = {
        // Most common replacements
        mdash: '—', ndash: '–',
        // XML and HTML Symbols
        hellip: '...', plus: '+', plusmn: '±', minus: '−', times: '×', divide: '÷',
        ne: '≠', asymp: '≈', le: '≤', ge: '≥', quot: '"', apos: "'",
        iexcl: '¡', cent: '¢', pound: '£', curren: '¤', yen: '¥', brvbar: '¦',
        sect: '§', uml: '¨', copy: '©', ordf: 'ª', laquo: '«', not: '¬',
        reg: '®', macr: '¯', deg: '°', sup2: '²', sup3: '³', acute: '´',
        micro: 'µ', para: '¶', middot: '·', cedil: '¸', sup1: '¹', ordm: 'º',
        raquo: '»', frac14: '¼', frac12: '½', frac34: '¾', iquest: '¿',
        circ: 'ˆ', tilde: '˜', lsquo: '‘', rsquo: '’', sbquo: '‚',
        ldquo: '“', rdquo: '”', bdquo: '„', bull: '•', permil: '‰',
        lsaquo: '‹', rsaquo: '›', oline: '‾', frasl: '⁄', euro: '€',
        image: 'ℑ', weierp: '℘', real: 'ℜ', trade: '™', alefsym: 'ℵ', crarr: '↵',
        forall: '∀', part: '∂', exist: '∃', empty: '∅', nabla: '∇', isin: '∈',
        notin: '∉', ni: '∋', prod: '∏', sum: '∑', lowast: '∗', radic: '√',
        prop: '∝', infin: '∞', ang: '∠', and: '∧', or: '∨', cap: '∩', cup: '∪',
        int: '∫', there4: '∴', sim: '∼', cong: '≅', sub: '⊂', sup: '⊃', nsub: '⊄',
        sube: '⊆', supe: '⊇', oplus: '⊕', otimes: '⊗', perp: '⊥', sdot: '⋅',
        lceil: '⌈', rceil: '⌉', lfloor: '⌊', rfloor: '⌋', lang: '〈', rang: '〉',
        loz: '◊', spades: '♠', clubs: '♣', hearts: '♥', diams: '♦'
    };

    // Entity names whose letter case is significant (&Sigma; vs &sigma;).
    var exactCaseEntities = {
        // Uppercase symbols: Greek
        Alpha: 'Α', Beta: 'Β', Gamma: 'Γ', Delta: 'Δ', Epsilon: 'Ε', Zeta: 'Ζ',
        Eta: 'Η', Theta: 'Θ', Iota: 'Ι', Kappa: 'Κ', Lambda: 'Λ', Mu: 'Μ',
        Nu: 'Ν', Xi: 'Ξ', Omicron: 'Ο', Pi: 'Π', Rho: 'Ρ', Sigma: 'Σ',
        Tau: 'Τ', Upsilon: 'Υ', Phi: 'Φ', Chi: 'Χ', Psi: 'Ψ', Omega: 'Ω',
        // Uppercase symbols: Latin
        Agrave: 'À', Aacute: 'Á', Acirc: 'Â', Atilde: 'Ã', Auml: 'Ä', Aring: 'Å',
        AElig: 'Æ', Ccedil: 'Ç', Egrave: 'È', Eacute: 'É', Ecirc: 'Ê', Euml: 'Ë',
        Igrave: 'Ì', Iacute: 'Í', Icirc: 'Î', Iuml: 'Ï', Ntilde: 'Ñ',
        Ograve: 'Ò', Oacute: 'Ó', Ocirc: 'Ô', Otilde: 'Õ', Ouml: 'Ö', Oslash: 'Ø',
        Ugrave: 'Ù', Uacute: 'Ú', Ucirc: 'Û', Uuml: 'Ü', Yacute: 'Ý',
        Scaron: 'Š', Yuml: 'Ÿ',
        // Uppercase symbols: XML and HTML
        Dagger: '‡', Prime: '″',
        // lowercase symbols: Greek
        alpha: 'α', beta: 'β', gamma: 'γ', delta: 'δ', epsilon: 'ε', zeta: 'ζ',
        eta: 'η', theta: 'θ', iota: 'ι', kappa: 'κ', lambda: 'λ', mu: 'μ',
        nu: 'ν', xi: 'ξ', omicron: 'ο', pi: 'π', rho: 'ρ', sigmaf: 'ς',
        sigma: 'σ', tau: 'τ', upsilon: 'υ', phi: 'φ', chi: 'χ', psi: 'ψ',
        omega: 'ω', thetasym: 'ϑ', upsih: 'ϒ', piv: 'ϖ',
        // lowercase symbols: Latin
        szlig: 'ß', agrave: 'à', aacute: 'á', acirc: 'â', atilde: 'ã', auml: 'ä',
        aring: 'å', aelig: 'æ', ccedil: 'ç', egrave: 'è', eacute: 'é', ecirc: 'ê',
        euml: 'ë', igrave: 'ì', iacute: 'í', icirc: 'î', iuml: 'ï', eth: 'ð',
        ntilde: 'ñ', ograve: 'ò', oacute: 'ó', ocirc: 'ô', otilde: 'õ', ouml: 'ö',
        oslash: 'ø', ugrave: 'ù', uacute: 'ú', ucirc: 'û', uuml: 'ü', yacute: 'ý',
        thorn: 'þ', yuml: 'ÿ', oelig: 'œ', scaron: 'š', fnof: 'ƒ',
        // lowercase symbols: XML and HTML
        dagger: '†', prime: '′',
        // Arrows
        larr: '←', rarr: '→', uarr: '↑', darr: '↓',
        lArr: '⇐', rArr: '⇒', uArr: '⇑', dArr: '⇓',
        harr: '↔', hArr: '⇔',
        // Specific case
        ETH: 'Ð', THORN: 'Þ', OElig: 'Œ'
    };

    // False positives
    // &lt; and &gt; are not listed: converting them breaks large amounts of
    // text which discusses programming/scripting.
    // &amp; is not listed: converting it breaks a large number of URLs and
    // discussion of programming/scripting.

    // One pass over every "&name;" token. Names absent from both tables are
    // returned unchanged. hasOwnProperty keeps inherited keys such as
    // "constructor" from being treated as entities. Every name is looked up
    // directly, so names with two leading capitals (AElig) are always handled.
    str = str.replace(/&([A-Za-z]+[0-9]*);/g, function (entity, name) {
        if (hasOwn.call(exactCaseEntities, name)) {
            return exactCaseEntities[name];
        }
        var lowered = name.toLowerCase();
        if (hasOwn.call(anyCaseEntities, lowered)) {
            return anyCaseEntities[lowered];
        }
        return entity;
    });

    // ASCII-art arrows.
    // NOTE(review): these patterns may originally have been written with
    // &lt; / &gt; instead of literal angle brackets - confirm against upstream.
    str = str.replace(/<==|<--/gi, '←');
    str = str.replace(/==>/gi, '→');

    // Task 2: Replace numeric html entities with unicode ( User:CharlotteWebb )

    // Symbols for which there may be a good reason to obfuscate/escape
    var dont_replace = "|!{}[]=<>";

    // Decode one numeric reference; returns the entity itself when it must
    // stay escaped or does not name a valid code point.
    function specialreplace(ent, digits, base) {
        var num = parseInt(digits, base);
        var chr;
        if (num > 0x10FFFF) {
            // Beyond the Unicode range: no character to substitute.
            return ent;
        }
        // see UTF-16 for chars outside the BMP
        if (num > 0xFFFF) {
            num -= 0x10000;
            chr = String.fromCharCode(0xD800 + (num >> 10), 0xDC00 + (num & 0x3FF));
        } else {
            chr = String.fromCharCode(num);
        }
        return dont_replace.indexOf(chr) === -1 ? chr : ent;
    }

    // perform replacement: decimal references first, then hexadecimal ones
    str = str.replace(/&#(\d+);/g, function (ent, digits) {
        return specialreplace(ent, digits, 10);
    });
    str = str.replace(/&#x([\da-f]+);/gi, function (ent, digits) {
        return specialreplace(ent, digits, 16);
    });

    // Task 3: Unprintable control characters Windows-1252 from User:CharlotteWebb
    // Runs after Task 2 so that references such as &#150; (decoded to U+0096
    // above) end up as the Windows-1252 character for that byte.
    var failstr = ""; // substituted for bytes Windows-1252 leaves undefined
    var windows1252 = [
        '€', failstr, '‚', 'ƒ', '„', '…', '†', '‡',     // U+0080 - U+0087
        'ˆ', '‰', 'Š', '‹', 'Œ', failstr, 'Ž', failstr, // U+0088 - U+008F
        failstr, '‘', '’', '“', '”', '•', '–', '—',     // U+0090 - U+0097
        '˜', '™', 'š', '›', 'œ', failstr, 'ž', 'Ÿ'      // U+0098 - U+009F
    ];
    str = str.replace(/[\u0080-\u009f]/g, function (chr) {
        return windows1252[chr.charCodeAt(0) - 0x80];
    });

    return str;
}
//</source>