// KidzSearch Encyclopedia:AutoEd/unicodify.js
//
// < KidzSearch Encyclopedia:AutoEd

//<source lang=javascript>
/**
 * Replaces HTML character references in a piece of text with the literal
 * Unicode characters they stand for, then repairs stray Windows-1252 bytes.
 *
 * Three passes are run, in this order:
 *  1. Named references (e.g. "&mdash;", "&alpha;") taken from fixed tables,
 *     plus the ASCII arrows "<==", "<--" and "==>".
 *  2. Numeric references, decimal ("&#8212;") and hexadecimal ("&#x2014;").
 *  3. Code points U+0080..U+009F, which are reinterpreted as Windows-1252.
 *
 * "&lt;", "&gt;" and "&amp;" are never touched, and numeric references that
 * decode to one of the characters in `dont_replace` are left as they are.
 *
 * @param {string} str Text to process.
 * @returns {string} The text with the references replaced.
 */
function autoEdUnicodify(str) { //MAIN FUNCTION describes list of fixes

 // Applies one table of named references to `str`. `table` is a flat list of
 // [name, character, name, character, ...]; `flags` are the RegExp flags
 // ('g' for case-sensitive names, 'gi' for case-insensitive ones).
 // Names are purely alphanumeric and no replacement contains '$', so neither
 // the pattern nor the replacement string needs escaping.
 function replaceNamed(table, flags) {
  for (var k = 0; k < table.length; k += 2) {
   str = str.replace(new RegExp('&' + table[k] + ';', flags), table[k + 1]);
  }
 }

 // Task 1: Replace named html entities with unicode

 // Most common replacements
 replaceNamed(['mdash', '—', 'ndash', '–'], 'gi');

 // Case insensitive symbols
 // The search is only a cheap guard that skips the table when the text holds
 // nothing that looks like a named reference.
 if (str.search(/&[a-z][a-z]+[0-9]*;/i) >= 0) {
  replaceNamed([
   //XML and HTML Symbols
   'hellip', '...', 'plus', '+', 'plusmn', '±', 'minus', '−',
   'times', '×', 'divide', '÷', 'ne', '≠', 'asymp', '≈',
   'le', '≤', 'ge', '≥', 'quot', '"', 'apos', "'",
   'iexcl', '¡', 'cent', '¢', 'pound', '£', 'curren', '¤',
   'yen', '¥', 'brvbar', '¦', 'sect', '§', 'uml', '¨',
   'copy', '©', 'ordf', 'ª', 'laquo', '«', 'not', '¬',
   'reg', '®', 'macr', '¯', 'deg', '°', 'sup2', '²',
   'sup3', '³', 'acute', '´',
   // MICRO SIGN, written as an escape because it is visually identical to
   // the Greek small letter mu handled further down.
   'micro', '\u00B5',
   'para', '¶', 'middot', '·', 'cedil', '¸', 'sup1', '¹',
   'ordm', 'º', 'raquo', '»', 'frac14', '¼', 'frac12', '½',
   'frac34', '¾', 'iquest', '¿', 'circ', 'ˆ', 'tilde', '˜',
   'lsquo', '‘', 'rsquo', '’', 'sbquo', '‚', 'ldquo', '“',
   'rdquo', '”', 'bdquo', '„', 'bull', '•', 'permil', '‰',
   'lsaquo', '‹', 'rsaquo', '›', 'oline', '‾', 'frasl', '⁄',
   'euro', '€', 'image', 'ℑ', 'weierp', '℘', 'real', 'ℜ',
   'trade', '™', 'alefsym', 'ℵ', 'crarr', '↵', 'forall', '∀',
   'part', '∂', 'exist', '∃', 'empty', '∅', 'nabla', '∇',
   'isin', '∈', 'notin', '∉', 'ni', '∋', 'prod', '∏',
   'sum', '∑', 'lowast', '∗', 'radic', '√', 'prop', '∝',
   'infin', '∞', 'ang', '∠', 'and', '∧', 'or', '∨',
   'cap', '∩', 'cup', '∪', 'int', '∫', 'there4', '∴',
   'sim', '∼', 'cong', '≅', 'sub', '⊂', 'sup', '⊃',
   'nsub', '⊄', 'sube', '⊆', 'supe', '⊇', 'oplus', '⊕',
   'otimes', '⊗', 'perp', '⊥', 'sdot', '⋅', 'lceil', '⌈',
   'rceil', '⌉', 'lfloor', '⌊', 'rfloor', '⌋', 'lang', '〈',
   'rang', '〉', 'loz', '◊', 'spades', '♠', 'clubs', '♣',
   'hearts', '♥', 'diams', '♦'
  ], 'gi');
 }

 // Uppercase symbols
 // These names differ from the lowercase table only by case ("&Alpha;" vs
 // "&alpha;", "&Dagger;" vs "&dagger;"), so they are matched case-sensitively.
 if (str.search(/&[A-Z][a-z]+;/) >= 0) {
  replaceNamed([
   //Greek symbols
   'Alpha', 'Α', 'Beta', 'Β', 'Gamma', 'Γ', 'Delta', 'Δ',
   'Epsilon', 'Ε', 'Zeta', 'Ζ', 'Eta', 'Η', 'Theta', 'Θ',
   'Iota', 'Ι', 'Kappa', 'Κ', 'Lambda', 'Λ', 'Mu', 'Μ',
   'Nu', 'Ν', 'Xi', 'Ξ', 'Omicron', 'Ο', 'Pi', 'Π',
   'Rho', 'Ρ', 'Sigma', 'Σ', 'Tau', 'Τ', 'Upsilon', 'Υ',
   'Phi', 'Φ', 'Chi', 'Χ', 'Psi', 'Ψ', 'Omega', 'Ω',
   //Latin symbols
   'Agrave', 'À', 'Aacute', 'Á', 'Acirc', 'Â', 'Atilde', 'Ã',
   'Auml', 'Ä', 'Aring', 'Å', 'Ccedil', 'Ç', 'Egrave', 'È',
   'Eacute', 'É', 'Ecirc', 'Ê', 'Euml', 'Ë', 'Igrave', 'Ì',
   'Iacute', 'Í', 'Icirc', 'Î', 'Iuml', 'Ï', 'Ntilde', 'Ñ',
   'Ograve', 'Ò', 'Oacute', 'Ó', 'Ocirc', 'Ô', 'Otilde', 'Õ',
   'Ouml', 'Ö', 'Oslash', 'Ø', 'Ugrave', 'Ù', 'Uacute', 'Ú',
   'Ucirc', 'Û', 'Uuml', 'Ü', 'Yacute', 'Ý', 'Scaron', 'Š',
   'Yuml', 'Ÿ',
   //XML and HTML Symbols
   'Dagger', '‡', 'Prime', '″'
  ], 'g');
 }

 // lowercase symbols
 if (str.search(/&[a-z][a-z]+;/) >= 0) {
  replaceNamed([
   //Greek symbols
   'alpha', 'α', 'beta', 'β', 'gamma', 'γ', 'delta', 'δ',
   'epsilon', 'ε', 'zeta', 'ζ', 'eta', 'η', 'theta', 'θ',
   'iota', 'ι', 'kappa', 'κ', 'lambda', 'λ',
   // GREEK SMALL LETTER MU, written as an escape to keep it distinct from
   // the micro sign above.
   'mu', '\u03BC',
   'nu', 'ν', 'xi', 'ξ', 'omicron', 'ο', 'pi', 'π',
   'rho', 'ρ', 'sigmaf', 'ς', 'sigma', 'σ', 'tau', 'τ',
   'upsilon', 'υ', 'phi', 'φ', 'chi', 'χ', 'psi', 'ψ',
   'omega', 'ω', 'thetasym', 'ϑ', 'upsih', 'ϒ', 'piv', 'ϖ',
   //Latin symbols
   'szlig', 'ß', 'agrave', 'à', 'aacute', 'á', 'acirc', 'â',
   'atilde', 'ã', 'auml', 'ä', 'aring', 'å', 'aelig', 'æ',
   'ccedil', 'ç', 'egrave', 'è', 'eacute', 'é', 'ecirc', 'ê',
   'euml', 'ë', 'igrave', 'ì', 'iacute', 'í', 'icirc', 'î',
   'iuml', 'ï', 'eth', 'ð', 'ntilde', 'ñ', 'ograve', 'ò',
   'oacute', 'ó', 'ocirc', 'ô', 'otilde', 'õ', 'ouml', 'ö',
   'oslash', 'ø', 'ugrave', 'ù', 'uacute', 'ú', 'ucirc', 'û',
   'uuml', 'ü', 'yacute', 'ý', 'thorn', 'þ', 'yuml', 'ÿ',
   'oelig', 'œ', 'scaron', 'š', 'fnof', 'ƒ',
   //XML and HTML Symbols
   'dagger', '†', 'prime', '′'
  ], 'g');
 }

 // False positives
 // "&lt;" and "&gt;" are deliberately NOT converted: doing so breaks large
 // amounts of code which discuss programming/scripting.
 // "&amp;" is deliberately NOT converted: doing so breaks a large number of
 // URLs and discussion of programming/scripting.

 // Arrows
 // Case-sensitive: "&larr;" (single arrow) and "&lArr;" (double arrow) differ
 // only by case.
 replaceNamed([
  'larr', '←', 'rarr', '→', 'uarr', '↑', 'darr', '↓',
  'lArr', '⇐', 'rArr', '⇒', 'uArr', '⇑', 'dArr', '⇓',
  'harr', '↔', 'hArr', '⇔'
 ], 'g');
 str = str.replace(/<==|<--/gi, '←');
 str = str.replace(/==>/gi, '→');

 // Specific case
 // Names that start with two capitals never satisfy the "Uppercase symbols"
 // guard above, so they are applied unconditionally here.
 replaceNamed(['AElig', 'Æ', 'ETH', 'Ð', 'THORN', 'Þ', 'OElig', 'Œ'], 'g');


 // Task 2: Replace numeric html entities with unicode ( User:CharlotteWebb )

 // Symbols for which there may be a good reason to obfuscate/escape
 var dont_replace = "|!{}[]=<>";

 // START specialreplace function from User:CharlotteWebb
 // Returns the text that should stand in for one numeric reference.
 //   ent    - the complete reference as matched, e.g. "&#x2014;"
 //   digits - only its digits, e.g. "2014"
 //   base   - 10 or 16
 // The reference itself is returned (i.e. nothing changes) when the number is
 // zero, is above U+10FFFF, is a lone surrogate, or decodes to a character
 // listed in dont_replace.
 function specialreplace(ent, digits, base) {
  var num = parseInt(digits, base);
  if (!(num > 0 && num <= 0x10FFFF) || (num >= 0xD800 && num <= 0xDFFF)) {
   return ent;
  }
  var chr;
  // see UTF-16 for chars outside the BMP
  // try this with Gothic letters at full volume ^_^
  if (num > 0xFFFF) {
   num -= 0x10000;
   chr = String.fromCharCode(0xD800 + (num >> 10), 0xDC00 + (num & 0x3FF));
  } else {
   chr = String.fromCharCode(num);
  }
  return dont_replace.indexOf(chr) == -1 ? chr : ent;
 }
 // END specialreplace function

 // perform replacement
 // A callback is used so that every occurrence is handled and so that the
 // decoded character is never interpreted as a "$" replacement pattern.
 str = str.replace(/&#(\d+);/g, function (ent, digits) {
  return specialreplace(ent, digits, 10);
 });
 str = str.replace(/&#x([\da-f]+);/gi, function (ent, digits) {
  return specialreplace(ent, digits, 16);
 });

 // Task 3: Unprintable control characters Windows-1252 from User:CharlotteWebb
 // Runs after Task 2 on purpose: a reference such as "&#147;" decodes to
 // U+0093, which is then mapped to the curly quote it means in Windows-1252.
 // Positions with no Windows-1252 assignment become failstr (currently the
 // empty string, i.e. the character is removed).
 var failstr = "";
 var cp1252 = [
  '€', failstr, '‚', 'ƒ', '„', '…', '†', '‡',        // U+0080..U+0087
  'ˆ', '‰', 'Š', '‹', 'Œ', failstr, 'Ž', failstr,    // U+0088..U+008F
  failstr, '‘', '’', '“', '”', '•', '–', '—',        // U+0090..U+0097
  '˜', '™', 'š', '›', 'œ', failstr, 'ž', 'Ÿ'         // U+0098..U+009F
 ];
 str = str.replace(/[\u0080-\u009f]/g, function (c) {
  return cp1252[c.charCodeAt(0) - 0x80];
 });

 return str;

} //</source>