Распознавание языка на чистом JS (en/ger) — требуются улучшения
Я написал простой код, чтобы определить, написано ли сообщение на английском или немецком языке. Функция "languageAnalysis()" вызывается из "identifyLanguage()"; код последней приведён в моём ответе ниже.
Код работает, но я ищу:
- Примеры текстов на английском или немецком языке, которые не работают
- улучшения, которые повысят точность результатов (примеры идей):
- частота заглавных букв
- частота пробелов
- средняя длина предложений
Не стесняйтесь участвовать:)
/** Letters a-z. Rows and columns of the bigram tables below follow this order. */
const LANGUAGE_LETTERS = "abcdefghijklmnopqrstuvwxyz".split("");

/** Reference single-letter values for German, compared against observed percentages (a-z). */
const GERMAN_LETTER_CHANCES = [
  6.5, 1.9, 3.0, 5.1, 17.4, 1.7, 3.0, 4.8, 7.6, 0.3, 1.2, 3.4, 2.5,
  9.8, 2.5, 0.8, 0.02, 7.0, 7.3, 6.2, 4.4, 0.7, 1.9, 0.03, 0.04, 1.1,
];

/** Reference single-letter values for English, compared against observed percentages (a-z). */
const ENGLISH_LETTER_CHANCES = [
  8.2, 1.5, 2.8, 4.3, 12.7, 2.2, 2.0, 6.1, 7.0, 0.2, 0.8, 4.0, 2.4,
  6.7, 7.5, 1.9, 0.1, 6.0, 6.3, 9.1, 2.8, 1.0, 2.4, 0.2, 2.0, 0.1,
];

/**
 * German bigram weights: GERMAN_BIGRAM_CHANCES[first][second], both indices in
 * a-z order. The values of the table add up to 10000.
 */
const GERMAN_BIGRAM_CHANCES = [
  /* a */ [8, 31, 27, 11, 64, 15, 30, 20, 5, 1, 7, 59, 28, 102, 0, 4, 0, 51, 53, 46, 75, 2, 3, 0, 1, 2],
  /* b */ [16, 1, 0, 1, 101, 0, 3, 1, 12, 0, 1, 9, 0, 1, 8, 0, 0, 9, 6, 4, 14, 0, 1, 0, 1, 1],
  /* c */ [2, 0, 0, 2, 1, 0, 0, 243, 1, 0, 14, 1, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
  /* d */ [54, 3, 1, 13, 228, 3, 4, 2, 93, 1, 3, 5, 4, 6, 9, 3, 0, 10, 11, 6, 16, 3, 4, 0, 0, 3],
  /* e */ [26, 45, 25, 51, 23, 26, 50, 57, 193, 3, 19, 63, 55, 402, 6, 13, 1, 410, 140, 55, 36, 14, 23, 2, 1, 11],
  /* f */ [19, 2, 0, 9, 25, 12, 3, 1, 7, 0, 1, 5, 1, 2, 9, 1, 0, 18, 4, 20, 24, 1, 1, 0, 0, 1],
  /* g */ [20, 3, 0, 12, 147, 2, 3, 3, 19, 1, 3, 9, 3, 5, 6, 1, 0, 14, 18, 18, 11, 4, 3, 0, 0, 3],
  /* h */ [70, 4, 1, 14, 103, 2, 4, 3, 23, 1, 3, 25, 11, 19, 18, 1, 0, 37, 11, 47, 11, 4, 9, 0, 0, 3],
  /* i */ [7, 7, 76, 20, 163, 5, 38, 12, 1, 1, 12, 25, 27, 168, 20, 2, 0, 17, 79, 78, 3, 5, 1, 0, 0, 5],
  /* j */ [7, 0, 0, 0, 9, 5, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0],
  /* k */ [28, 1, 0, 2, 26, 1, 1, 1, 7, 0, 1, 10, 1, 1, 24, 1, 0, 13, 5, 14, 9, 1, 1, 0, 0, 1],
  /* l */ [45, 7, 2, 14, 65, 5, 6, 2, 61, 1, 7, 42, 3, 4, 14, 2, 0, 2, 22, 27, 13, 3, 2, 0, 0, 3],
  /* m */ [40, 6, 1, 8, 50, 4, 4, 3, 44, 2, 3, 4, 23, 3, 15, 7, 0, 2, 10, 8, 14, 4, 3, 0, 0, 2],
  /* n */ [68, 23, 5, 187, 123, 19, 94, 17, 65, 5, 25, 10, 23, 43, 18, 10, 0, 10, 74, 59, 33, 18, 29, 0, 0, 25],
  /* o */ [3, 8, 15, 7, 25, 6, 5, 9, 1, 1, 3, 31, 17, 64, 1, 6, 0, 50, 19, 9, 3, 3, 7, 0, 1, 6],
  /* p */ [16, 0, 0, 3, 10, 6, 0, 2, 4, 0, 0, 4, 0, 0, 11, 5, 0, 23, 1, 3, 4, 0, 0, 0, 0, 0],
  /* q */ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0],
  /* r */ [80, 25, 9, 67, 112, 18, 27, 19, 52, 4, 23, 18, 20, 31, 30, 9, 0, 15, 54, 49, 48, 12, 17, 0, 0, 14],
  /* s */ [36, 10, 89, 20, 99, 7, 13, 9, 65, 2, 11, 9, 12, 7, 28, 22, 0, 8, 76, 116, 15, 9, 10, 0, 2, 7],
  /* t */ [57, 8, 1, 35, 186, 5, 10, 14, 59, 2, 4, 11, 9, 9, 15, 3, 0, 31, 50, 23, 26, 8, 21, 0, 1, 26],
  /* u */ [3, 8, 16, 5, 78, 27, 8, 4, 2, 0, 3, 7, 21, 120, 0, 5, 0, 33, 48, 23, 1, 3, 2, 0, 0, 1],
  /* v */ [3, 0, 0, 0, 37, 0, 0, 0, 9, 0, 0, 0, 0, 0, 43, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  /* w */ [34, 0, 0, 0, 48, 0, 0, 0, 36, 1, 0, 0, 0, 1, 17, 0, 0, 0, 1, 0, 9, 0, 0, 0, 0, 0],
  /* x */ [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
  /* y */ [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
  /* z */ [4, 1, 0, 1, 28, 0, 1, 0, 11, 0, 1, 2, 1, 0, 2, 0, 0, 0, 1, 7, 43, 1, 9, 0, 0, 1],
];

/**
 * English bigram weights: ENGLISH_BIGRAM_CHANCES[first][second], both indices
 * in a-z order. The values of the table add up to 10000.
 *
 * NOTE(review): the "p" row looks shifted by one column (42 sits under "pq",
 * where "pr" would be expected). The values are reproduced exactly as found;
 * verify against the data source before relying on that row.
 */
const ENGLISH_BIGRAM_CHANCES = [
  /* a */ [1, 32, 39, 15, 0, 10, 18, 0, 16, 0, 10, 77, 18, 177, 2, 31, 1, 106, 67, 124, 12, 24, 7, 0, 27, 1],
  /* b */ [8, 0, 0, 0, 58, 0, 0, 0, 6, 2, 0, 21, 1, 0, 11, 0, 0, 6, 5, 0, 25, 0, 0, 0, 19, 0],
  /* c */ [44, 0, 12, 0, 55, 1, 0, 46, 15, 0, 8, 16, 0, 0, 59, 1, 0, 7, 1, 38, 16, 0, 1, 0, 0, 0],
  /* d */ [45, 18, 4, 10, 39, 12, 2, 3, 57, 1, 0, 7, 9, 5, 37, 7, 1, 10, 32, 39, 8, 4, 9, 0, 6, 0],
  /* e */ [65, 11, 64, 107, 39, 23, 20, 15, 40, 1, 2, 46, 43, 125, 46, 32, 14, 154, 145, 80, 7, 16, 41, 17, 17, 0],
  /* f */ [21, 2, 9, 1, 25, 14, 1, 6, 26, 1, 0, 10, 3, 2, 38, 3, 0, 4, 8, 42, 11, 1, 4, 0, 1, 0],
  /* g */ [11, 2, 1, 1, 32, 3, 1, 16, 10, 0, 0, 4, 1, 3, 23, 1, 0, 21, 7, 18, 8, 0, 2, 0, 1, 0],
  /* h */ [84, 1, 2, 1, 251, 2, 0, 5, 72, 0, 0, 3, 1, 2, 46, 1, 0, 8, 3, 22, 2, 0, 7, 0, 1, 0],
  /* i */ [18, 7, 55, 16, 37, 27, 10, 0, 0, 0, 8, 39, 32, 169, 63, 3, 0, 21, 106, 88, 0, 14, 1, 1, 0, 4],
  /* j */ [0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0],
  /* k */ [0, 0, 0, 0, 28, 0, 0, 0, 8, 0, 0, 0, 0, 3, 3, 0, 0, 0, 2, 1, 0, 0, 3, 0, 3, 0],
  /* l */ [34, 7, 8, 28, 72, 5, 1, 0, 57, 1, 3, 60, 4, 1, 28, 2, 2, 2, 12, 19, 8, 2, 5, 0, 47, 0],
  /* m */ [56, 9, 1, 2, 48, 0, 0, 1, 26, 0, 0, 0, 5, 3, 28, 16, 0, 0, 6, 6, 13, 0, 2, 0, 3, 0],
  /* n */ [54, 7, 31, 118, 64, 8, 75, 9, 37, 3, 3, 10, 7, 9, 65, 7, 0, 5, 51, 110, 12, 4, 15, 1, 14, 0],
  /* o */ [9, 18, 18, 16, 3, 94, 3, 3, 13, 0, 5, 17, 44, 145, 23, 29, 0, 118, 37, 53, 96, 13, 36, 0, 4, 2],
  /* p */ [21, 1, 0, 0, 40, 0, 0, 7, 8, 0, 0, 29, 0, 0, 28, 26, 42, 3, 14, 7, 0, 1, 0, 2, 0, 0],
  /* q */ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 20, 0, 0, 0, 0, 0],
  /* r */ [57, 4, 14, 16, 148, 6, 6, 3, 77, 1, 11, 12, 15, 17, 54, 8, 0, 18, 39, 63, 6, 5, 10, 0, 17, 0],
  /* s */ [75, 13, 21, 6, 84, 13, 6, 30, 42, 0, 2, 6, 14, 19, 71, 24, 2, 6, 41, 121, 30, 2, 27, 0, 4, 0],
  /* t */ [56, 14, 6, 9, 94, 5, 1, 325, 135, 0, 0, 12, 14, 8, 121, 8, 0, 30, 32, 53, 22, 4, 16, 0, 21, 0],
  /* u */ [18, 5, 17, 11, 11, 1, 12, 2, 5, 0, 0, 28, 9, 33, 2, 17, 0, 49, 42, 45, 0, 0, 0, 1, 1, 1],
  /* v */ [15, 0, 0, 0, 53, 0, 0, 0, 19, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  /* w */ [32, 0, 3, 4, 30, 1, 0, 48, 37, 0, 0, 4, 1, 10, 17, 2, 0, 1, 3, 6, 1, 1, 2, 0, 0, 0],
  /* x */ [3, 0, 5, 0, 1, 0, 0, 0, 4, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0],
  /* y */ [11, 11, 10, 4, 12, 3, 5, 5, 18, 0, 0, 6, 4, 3, 28, 7, 0, 5, 17, 21, 1, 3, 14, 0, 0, 0],
  /* z */ [0, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
];

/** Trigrams scored by observed frequency: [trigram, German value, English value]. */
const SCORED_TRIGRAMS = [
  ["ich", 1.11, 0.23],
  ["nde", 0.89, 0.23],
  ["und", 0.87, 0.33],
  ["der", 0.86, 0.28],
  ["ing", 0.21, 1.11],
  ["and", 0.25, 1.02],
  ["ent", 0.26, 0.73],
];

/** Characters whose mere presence is treated as proof of German. */
const GERMAN_ONLY_LETTERS = ["ä", "ö", "ü", "ß"];

/** Word-initial letters scored by observed frequency: [letter, German value, English value]. */
const SCORED_FIRST_LETTERS = [
  ["a", 6, 11.7],
  ["c", 0.1, 5.24],
  ["d", 14.25, 3.17],
  ["e", 7.9, 2.8],
  ["o", 1, 7.63],
  ["t", 1.5, 15.98],
];

/** Characters that separate words: whitespace, double quote, ? ! . , : ; and apostrophe. */
const WORD_SEPARATORS = /[\s"?!.,:;']+/;

/** Frequent German words; a lower index wins when a word is in both lists. */
const GERMAN_FREQUENT_WORDS = ["der","die","und","millionen","den","von","zu","das","mit","sich","des","auf","für","ist","im","dem","nicht","ein","seiner","eine","als","auch","es","worden","werden","aus","er","hat","dass","was","nach","wird","bei","einer","will","um","am","sind","noch","wie","einem","über","einen","zwischen","so","sie","zum","war","haben","nur","oder","aber","vor","zur","bis","mehr","durch","man","sein","wurde","sei","sagte","prozent","hatte","kann","gegen","vom","können","schon","wenn","habe","seine","mark","ihre","dann","unter","wir","soll","ich","eines","immer","jahr","zwei","jahren","diese","dieser","wieder","keine","uhr","tag","du","mein","gut","heute"];

/** Frequent English words; a lower index wins when a word is in both lists. */
const ENGLISH_FREQUENT_WORDS = ["the","of","and","a","to","is","you","that","it","he","was","for","on","are","as","with","his","they","i","at","be","this","have","from","or","one","had","by","word","but","not","what","all","were","we","when","your","can","said","there","use","each","which","she","do","how","their","if","will","up","other","about","out","many","then","them","these","so","some","her","would","make","like","him","into","time","has","look","two","more","write","go","see","may","no","way","could","made","my","than","first","come","been","call","who","oil","its","now","find","long","down","day","did","get"];

/** Trigrams whose presence adds weight for German: [trigram, weight] (weights sum to 10.35). */
const GERMAN_MARKER_TRIGRAMS = [
  ["ein", 1.22], ["die", 0.87], ["che", 0.75], ["end", 0.75], ["gen", 0.71], ["sch", 0.66],
  ["cht", 0.61], ["den", 0.57], ["nge", 0.52], ["nun", 0.48], ["ung", 0.48], ["das", 0.47],
  ["hen", 0.47], ["ind", 0.46], ["enw", 0.45], ["ens", 0.44], ["ies", 0.44],
];

/** Trigrams whose presence adds weight for English: [trigram, weight] (weights sum to 10.35). */
const ENGLISH_MARKER_TRIGRAMS = [
  ["the", 3.54], ["ion", 0.76], ["tio", 0.76], ["her", 0.69], ["ate", 0.67], ["tha", 0.63],
  ["ati", 0.61], ["for", 0.61], ["hat", 0.56], ["his", 0.53], ["res", 0.51], ["ill", 0.48],
];

/** Counts the non-overlapping occurrences of `needle` in `haystack`. */
function countOccurrences(haystack, needle) {
  return haystack.split(needle).length - 1;
}

/**
 * Credits the language whose reference value lies closer to `observed`.
 *
 * - observed above both references: the language with the higher reference
 *   gains the distance between the two references;
 * - observed below both references: the language with the lower reference
 *   gains that same distance;
 * - observed in between: the nearer reference's language gains the distance
 *   from `observed` to the other reference.
 *
 * `scores` ({german, english}) is updated in place.
 */
function creditCloserLanguage(scores, observed, germanChance, englishChance) {
  const high = Math.max(germanChance, englishChance);
  const low = Math.min(germanChance, englishChance);
  const highLanguage = high === englishChance ? "english" : "german";
  const lowLanguage = low === englishChance ? "english" : "german";

  if (observed > high) {
    scores[highLanguage] = scores[highLanguage] + high - low;
  } else if (observed < low) {
    scores[lowLanguage] = scores[lowLanguage] + high - low;
  } else if (observed - low > high - observed) {
    scores[highLanguage] = scores[highLanguage] + observed - low;
  } else {
    scores[lowLanguage] = scores[lowLanguage] + high - observed;
  }
}

/**
 * Appends a verdict for the language with the strictly higher score; its
 * reliability is that language's share of both scores in percent. A tie
 * records nothing. Returns the winning language, or null on a tie.
 */
function recordVerdict(verdicts, type, germanScore, englishScore) {
  if (germanScore > englishScore) {
    verdicts.push({ language: "german", reliability: (germanScore / (germanScore + englishScore)) * 100, type });
    return "german";
  }
  if (englishScore > germanScore) {
    verdicts.push({ language: "english", reliability: (englishScore / (germanScore + englishScore)) * 100, type });
    return "english";
  }
  return null;
}

/**
 * Guesses whether `text` is German or English by combining several
 * heuristics: letter frequency, bigram presence, trigram frequency, German-only
 * characters, word-initial letters, frequent words and marker trigrams.
 *
 * Each heuristic that reaches a decision contributes a verdict with a
 * reliability in percent; the verdicts are then merged into one share per
 * language. The individual verdicts are written to console.log.
 *
 * @param {string} text - Text to classify; null/undefined is treated as "".
 * @returns {string} "german" or "english" immediately followed by the winning
 *   share in percent (e.g. "english88.4"), or "undefined" when no heuristic
 *   decides or both languages end up with the same share.
 */
function languageAnalysis(text) {
  const normalized = String(text == null ? "" : text).toLowerCase();
  const verdicts = [];

  // BLOCK 1: single character frequency (share of all characters, in percent).
  const letterScores = { german: 0, english: 0 };
  LANGUAGE_LETTERS.forEach((letter, index) => {
    const count = countOccurrences(normalized, letter);
    if (count === 0) {
      return; // absent letters are not scored
    }
    creditCloserLanguage(
      letterScores,
      (count / normalized.length) * 100,
      GERMAN_LETTER_CHANCES[index],
      ENGLISH_LETTER_CHANCES[index],
    );
  });
  recordVerdict(verdicts, "Character-Frequency", letterScores.german, letterScores.english);

  // BLOCK 2: bigram presence. Every bigram found at least once adds its weight
  // from both tables; the number of occurrences is not taken into account.
  let germanBigramScore = 0;
  let englishBigramScore = 0;
  LANGUAGE_LETTERS.forEach((first, row) => {
    LANGUAGE_LETTERS.forEach((second, column) => {
      if (normalized.includes(first + second)) {
        germanBigramScore += GERMAN_BIGRAM_CHANCES[row][column];
        englishBigramScore += ENGLISH_BIGRAM_CHANCES[row][column];
      }
    });
  });
  const bigramWinner = recordVerdict(verdicts, "Bigramm-Frequency", germanBigramScore, englishBigramScore);
  if (bigramWinner === "german" && germanBigramScore > 2 * englishBigramScore) {
    verdicts.push({ language: "german", reliability: 100, type: "Bigramm-Frequency-Obvious" });
  } else if (bigramWinner === "english" && englishBigramScore > 2 * germanBigramScore) {
    verdicts.push({ language: "english", reliability: 100, type: "Bigramm-Frequency-Obvious" });
  }

  // BLOCK 3: trigram frequency (characters covered by the trigram, in percent).
  const trigramScores = { german: 0, english: 0 };
  for (const [trigram, germanChance, englishChance] of SCORED_TRIGRAMS) {
    const count = countOccurrences(normalized, trigram);
    if (count > 0) {
      creditCloserLanguage(trigramScores, ((count * 3) / normalized.length) * 100, germanChance, englishChance);
    }
  }
  recordVerdict(verdicts, "Trigramm-Frequency", trigramScores.german, trigramScores.english);

  // BLOCK 4: characters that only occur in German.
  if (GERMAN_ONLY_LETTERS.some((letter) => normalized.includes(letter))) {
    verdicts.push({ language: "german", reliability: 100, type: "German-Characters" });
  }

  // BLOCK 5: first letter frequency. Every word takes part, including the last
  // one when the text does not end with punctuation.
  const words = normalized.split(WORD_SEPARATORS).filter((word) => word !== "");
  const initials = words.map((word) => word.charAt(0)).join("");
  const firstLetterScores = { german: 0, english: 0 };
  for (const [letter, germanChance, englishChance] of SCORED_FIRST_LETTERS) {
    const count = countOccurrences(initials, letter);
    if (count > 0) {
      creditCloserLanguage(firstLetterScores, (count / initials.length) * 100, germanChance, englishChance);
    }
  }
  recordVerdict(verdicts, "First-Letters", firstLetterScores.german, firstLetterScores.english);

  // BLOCK 6: frequent words. A word listed for both languages counts for the
  // language in which it ranks higher; equal ranks count as German.
  const germanRanks = new Map(GERMAN_FREQUENT_WORDS.map((word, rank) => [word, rank]));
  const englishRanks = new Map(ENGLISH_FREQUENT_WORDS.map((word, rank) => [word, rank]));
  let germanWordHits = 0;
  let englishWordHits = 0;
  for (const word of words) {
    const germanRank = germanRanks.get(word);
    const englishRank = englishRanks.get(word);
    if (germanRank !== undefined && (englishRank === undefined || germanRank <= englishRank)) {
      germanWordHits++;
    } else if (englishRank !== undefined) {
      englishWordHits++;
    }
  }
  const wordWinner = recordVerdict(verdicts, "Frequent-Words", germanWordHits, englishWordHits);
  if (wordWinner === "german" && germanWordHits > 4 && englishWordHits === 0) {
    verdicts.push({ language: "german", reliability: 100, type: "Frequent-Words-Obvious" });
  } else if (wordWinner === "english" && englishWordHits > 4 && germanWordHits === 0) {
    verdicts.push({ language: "english", reliability: 100, type: "Frequent-Words-Obvious" });
  }

  // BLOCK 7: language-specific marker trigrams (presence only).
  let germanMarkerScore = 0;
  for (const [trigram, weight] of GERMAN_MARKER_TRIGRAMS) {
    if (normalized.includes(trigram)) {
      germanMarkerScore += weight;
    }
  }
  let englishMarkerScore = 0;
  for (const [trigram, weight] of ENGLISH_MARKER_TRIGRAMS) {
    if (normalized.includes(trigram)) {
      englishMarkerScore += weight;
    }
  }
  recordVerdict(verdicts, "Trigram-Comparison", germanMarkerScore, englishMarkerScore);

  // FINAL MEASURE: a verdict of r% for one language counts as (100 - r)% for
  // the other one; the totals are then normalised to shares in percent.
  let result = "undefined";
  if (verdicts.length > 0) {
    let germanTotal = 0;
    let englishTotal = 0;
    for (const verdict of verdicts) {
      if (verdict.language === "german") {
        germanTotal += verdict.reliability;
        englishTotal += 100 - verdict.reliability;
      } else {
        englishTotal += verdict.reliability;
        germanTotal += 100 - verdict.reliability;
      }
    }
    const total = germanTotal + englishTotal;
    const germanShare = (germanTotal / total) * 100;
    const englishShare = (englishTotal / total) * 100;
    if (germanShare > englishShare) {
      result = "german" + germanShare;
    } else if (englishShare > germanShare) {
      result = "english" + englishShare;
    }
  }

  let results = "ANALYSIS-RESULTS (" + normalized + "): ";
  for (const verdict of verdicts) {
    results += verdict.type + ":" + verdict.language + "(" + verdict.reliability + "%); ";
  }
  console.log(results);
  return result;
}
1 ответ
Я использую функцию identifyLanguage() для вызова languageAnalysis(). Если результат анализа всего сообщения недостаточно надёжен (менее 70 %), анализ выполняется повторно — на этот раз для каждого отдельного предложения сообщения, — чтобы получить более точный конечный результат.
/**
 * Splits a languageAnalysis() result such as "german87.5" into its parts.
 * Returns null for "undefined" or any other text without a usable number.
 */
function parseAnalysisResult(result) {
  for (const language of ["german", "english"]) {
    if (result.startsWith(language)) {
      const reliability = Number.parseFloat(result.slice(language.length));
      return Number.isNaN(reliability) ? null : { language, reliability };
    }
  }
  return null;
}

/** Builds the human-readable verdict returned by identifyLanguage(). */
function formatLanguageVerdict(language, reliability) {
  return language + " (" + reliability + "% reliable)";
}

/**
 * Detects whether `message` is German or English.
 *
 * The whole message is analysed first. When that verdict is missing or less
 * than 70% reliable and the message has more than one sentence, every
 * sentence is analysed on its own and the sentence verdicts are combined with
 * the whole-message verdict:
 *   - no whole-message verdict: the sentence verdict is used as is;
 *   - both agree: the two reliabilities are averaged;
 *   - they disagree: the side with the higher reliability wins, reported as
 *     its share of both reliabilities.
 * Intermediate verdicts go to console.warn/console.log, the final one to
 * console.info.
 *
 * @param {string} message - Text to classify.
 * @returns {string} "german (N% reliable)", "english (N% reliable)" or
 *   "undefined".
 */
function identifyLanguage(message) {
  const overall = parseAnalysisResult(languageAnalysis(message));
  let language = "undefined";

  if (overall !== null && overall.reliability >= 70) {
    language = formatLanguageVerdict(overall.language, overall.reliability);
    console.info(language);
    return language;
  }

  console.warn(overall === null ? "undefined" : formatLanguageVerdict(overall.language, overall.reliability));

  // A sentence ends at ?, !, . or :. A final sentence without closing
  // punctuation is kept as well.
  const sentences = (String(message).match(/[^?!.:]+[?!.:]*/g) || [])
    .map((sentence) => sentence.trim())
    .filter((sentence) => sentence !== "");

  if (sentences.length > 1) {
    let germanScore = 0;
    let englishScore = 0;
    let decided = 0;
    for (const sentence of sentences) {
      const verdict = parseAnalysisResult(languageAnalysis(sentence));
      if (verdict === null) {
        console.log("undefined");
        continue;
      }
      console.log(formatLanguageVerdict(verdict.language, verdict.reliability));
      decided++;
      // A verdict of r% for one language counts as (100 - r)% for the other.
      if (verdict.language === "german") {
        germanScore += verdict.reliability;
        englishScore += 100 - verdict.reliability;
      } else {
        englishScore += verdict.reliability;
        germanScore += 100 - verdict.reliability;
      }
    }

    if (decided > 0 && germanScore !== englishScore) {
      const total = germanScore + englishScore;
      const sentenceLanguage = germanScore > englishScore ? "german" : "english";
      const sentenceShare = (Math.max(germanScore, englishScore) / total) * 100;

      if (overall === null) {
        language = formatLanguageVerdict(sentenceLanguage, sentenceShare);
      } else if (overall.language === sentenceLanguage) {
        language = formatLanguageVerdict(sentenceLanguage, (sentenceShare + overall.reliability) / 2);
      } else {
        // Sentences and whole message disagree: weigh the two reliabilities
        // against each other so that either side can win.
        const combined = sentenceShare + overall.reliability;
        const sentenceSide = (sentenceShare / combined) * 100;
        const messageSide = (overall.reliability / combined) * 100;
        if (sentenceSide > messageSide) {
          language = formatLanguageVerdict(sentenceLanguage, sentenceSide);
        } else if (messageSide > sentenceSide) {
          language = formatLanguageVerdict(overall.language, messageSide);
        }
      }
    }
  } else if (overall !== null) {
    // Nothing to split: keep the (weak) whole-message verdict.
    language = formatLanguageVerdict(overall.language, overall.reliability);
  }

  console.info(language);
  return language;
}
// Demo: classify a sample message. The variable is declared explicitly so the
// script also runs in strict mode / as an ES module.
const message = "Hello you! How are you?";
const detectedLanguage = identifyLanguage(message);
// alert() only exists in browsers; skip the pop-up elsewhere.
if (typeof alert === "function") {
  alert(detectedLanguage);
}