remove cheerio and add custom html parser

This commit is contained in:
ewwwwwwww 2022-07-04 21:27:17 -07:00
parent a2fab1285d
commit 9d0a3b7a69
3 changed files with 57 additions and 162 deletions

View file

@ -1,10 +1,5 @@
// import data from '@emoji-mart/data';
import { load as cheerioLoad } from 'cheerio';
import { parseDocument } from 'htmlparser2';
import unicodeMapping from './mapping';
import type { Node as CheerioNode } from 'cheerio';
import type { Emoji as EmojiMart, CustomEmoji as EmojiMartCustom } from 'emoji-mart';
/*
@ -99,24 +94,24 @@ const popStack = (stack: string, open: boolean) => {
// TODO: handle grouped unicode emojis
export const emojifyText = (str: string, customEmojis = {}) => {
let res = '';
let buf = '';
let stack = '';
let open = false;
for (const c of Array.from(str)) { // chunk by unicode codepoint with Array.from
if (c in unicodeMapping) {
if (open) { // unicode emoji inside colon
res += popStack(stack, open);
buf += popStack(stack, open);
}
res += convertUnicode(c);
buf += convertUnicode(c);
} else if (c === ':') {
stack += ':';
// we see another : we convert it and clear the stack buffer
if (open) {
res += convertEmoji(stack, customEmojis);
buf += convertEmoji(stack, customEmojis);
stack = '';
}
@ -128,54 +123,75 @@ export const emojifyText = (str: string, customEmojis = {}) => {
// if the stack is non-null and we see invalid chars it's a string not emoji
// so we push it to the return result and clear it
if (!validEmojiChar(c)) {
res += popStack(stack, open);
buf += popStack(stack, open);
}
} else {
res += c;
buf += c;
}
}
}
// never found a closing colon so it's just a raw string
if (open) {
res += stack;
buf += stack;
}
return res;
return buf;
};
// const parseHmtl = (str: string) => {
// const ret = [];
// let depth = 0;
//
// return ret;
// }
const parseHTML = (str: string): { text: boolean, data: string }[] => {
const tokens = [];
let buf = '';
let stack = '';
let open = false;
const filterTextNodes = (idx: number, el: CheerioNode) => {
return el.nodeType === Node.TEXT_NODE;
for (const c of str) {
if (c === '<') {
if (open) {
tokens.push({ text: true, data: stack });
stack = '<';
} else {
tokens.push({ text: true, data: buf });
stack = '<';
open = true;
}
} else if (c === '>') {
if (open) {
open = false;
tokens.push({ text: false, data: stack + '>' });
stack = '';
buf = '';
} else {
buf += '>';
}
} else {
if (open) {
stack += c;
} else {
buf += c;
}
}
}
if (open) {
tokens.push({ text: true, data: buf + stack });
} else if (buf !== '') {
tokens.push({ text: true, data: buf });
}
return tokens;
};
const emojify = (str: string, customEmojis = {}) => {
const dom = parseDocument(str);
const $ = cheerioLoad(dom, {
xmlMode: true,
decodeEntities: false,
});
return parseHTML(str)
.map(({ text, data }) => {
if (!text) return data;
if (data.length === 0 || data === ' ') return data;
$.root()
.contents() // iterate over flat map of all html elements
.filter(filterTextNodes) // only iterate over text nodes
.each((idx, el) => {
// skip common case
// @ts-ignore
if (el.data.length === 0 || el.data === ' ') return;
// mutating el.data is incorrect but we do it to prevent a second dom parse
// @ts-ignore
el.data = emojifyText(el.data, customEmojis);
});
return $.html();
return emojifyText(data, customEmojis);
})
.join('');
};
export default emojify;

View file

@ -104,7 +104,6 @@
"bootstrap-icons": "^1.5.0",
"bowser": "^2.11.0",
"browserslist": "^4.16.6",
"cheerio": "^1.0.0-rc.12",
"classnames": "^2.2.5",
"copy-webpack-plugin": "^9.0.1",
"core-js": "^3.15.2",
@ -125,7 +124,6 @@
"history": "^4.10.1",
"html-webpack-harddisk-plugin": "^2.0.0",
"html-webpack-plugin": "^5.5.0",
"htmlparser2": "^8.0.1",
"http-link-header": "^1.0.2",
"immutable": "^4.0.0",
"imports-loader": "^4.0.0",

121
yarn.lock
View file

@ -1271,13 +1271,6 @@
dependencies:
regenerator-runtime "^0.12.0"
"@babel/runtime@^7.0.0":
version "7.18.6"
resolved "https://registry.yarnpkg.com/@babel/runtime/-/runtime-7.18.6.tgz#6a1ef59f838debd670421f8c7f2cbb8da9751580"
integrity sha512-t9wi7/AW6XtKahAe20Yw0/mMljKq0B1r2fPdvaAdV/KPDZewFXdaaa6K7lxmZBZ8FBNpCiAT6iHPmd6QO9bKfQ==
dependencies:
regenerator-runtime "^0.13.4"
"@babel/runtime@^7.1.2", "@babel/runtime@^7.10.2", "@babel/runtime@^7.11.2", "@babel/runtime@^7.12.1", "@babel/runtime@^7.2.0", "@babel/runtime@^7.8.4", "@babel/runtime@^7.9.2":
version "7.15.4"
resolved "https://registry.yarnpkg.com/@babel/runtime/-/runtime-7.15.4.tgz#fd17d16bfdf878e6dd02d19753a39fa8a8d9c84a"
@ -4079,31 +4072,6 @@ charcodes@^0.2.0:
resolved "https://registry.yarnpkg.com/charcodes/-/charcodes-0.2.0.tgz#5208d327e6cc05f99eb80ffc814707572d1f14e4"
integrity sha512-Y4kiDb+AM4Ecy58YkuZrrSRJBDQdQ2L+NyS1vHHFtNtUjgutcZfx3yp1dAONI/oPaPmyGfCLx5CxL+zauIMyKQ==
cheerio-select@^2.1.0:
version "2.1.0"
resolved "https://registry.yarnpkg.com/cheerio-select/-/cheerio-select-2.1.0.tgz#4d8673286b8126ca2a8e42740d5e3c4884ae21b4"
integrity sha512-9v9kG0LvzrlcungtnJtpGNxY+fzECQKhK4EGJX2vByejiMX84MFNQw4UxPJl3bFbTMw+Dfs37XaIkCwTZfLh4g==
dependencies:
boolbase "^1.0.0"
css-select "^5.1.0"
css-what "^6.1.0"
domelementtype "^2.3.0"
domhandler "^5.0.3"
domutils "^3.0.1"
cheerio@^1.0.0-rc.12:
version "1.0.0-rc.12"
resolved "https://registry.yarnpkg.com/cheerio/-/cheerio-1.0.0-rc.12.tgz#788bf7466506b1c6bf5fae51d24a2c4d62e47683"
integrity sha512-VqR8m68vM46BNnuZ5NtnGBKIE/DfN0cRIzg9n40EIq9NOv90ayxLBXA8fXC5gquFRGJSTRqBq25Jt2ECLR431Q==
dependencies:
cheerio-select "^2.1.0"
dom-serializer "^2.0.0"
domhandler "^5.0.3"
domutils "^3.0.1"
htmlparser2 "^8.0.1"
parse5 "^7.0.0"
parse5-htmlparser2-tree-adapter "^7.0.0"
"chokidar@>=3.0.0 <4.0.0":
version "3.5.2"
resolved "https://registry.yarnpkg.com/chokidar/-/chokidar-3.5.2.tgz#dba3976fcadb016f66fd365021d91600d01c1e75"
@ -4496,17 +4464,6 @@ css-select@^4.1.3:
domutils "^2.6.0"
nth-check "^2.0.0"
css-select@^5.1.0:
version "5.1.0"
resolved "https://registry.yarnpkg.com/css-select/-/css-select-5.1.0.tgz#b8ebd6554c3637ccc76688804ad3f6a6fdaea8a6"
integrity sha512-nwoRF1rvRRnnCqqY7updORDsuqKzqYJ28+oSMaJMMgOauh3fvwHqMS7EZpIPqK8GL+g9mKxF1vP/ZjSeNjEVHg==
dependencies:
boolbase "^1.0.0"
css-what "^6.1.0"
domhandler "^5.0.2"
domutils "^3.0.1"
nth-check "^2.0.1"
css-system-font-keywords@^1.0.0:
version "1.0.0"
resolved "https://registry.yarnpkg.com/css-system-font-keywords/-/css-system-font-keywords-1.0.0.tgz#85c6f086aba4eb32c571a3086affc434b84823ed"
@ -4525,11 +4482,6 @@ css-what@^5.0.0:
resolved "https://registry.yarnpkg.com/css-what/-/css-what-5.0.1.tgz#3efa820131f4669a8ac2408f9c32e7c7de9f4cad"
integrity sha512-FYDTSHb/7KXsWICVsxdmiExPjCfRC4qRFBdVwv7Ax9hMnvMmEjP9RfxTEZ3qPZGmADDn2vAKSo9UcN1jKVYscg==
css-what@^6.1.0:
version "6.1.0"
resolved "https://registry.yarnpkg.com/css-what/-/css-what-6.1.0.tgz#fb5effcf76f1ddea2c81bdfaa4de44e79bac70f4"
integrity sha512-HTUrgRJ7r4dsZKU6GjmpfRK1O76h97Z8MfS1G0FozR+oF2kG6Vfe8JE6zwrkbxigziPHinCJ+gCPjA9EaBDtRw==
css.escape@^1.5.1:
version "1.5.1"
resolved "https://registry.yarnpkg.com/css.escape/-/css.escape-1.5.1.tgz#42e27d4fa04ae32f931a4b4d4191fa9cddee97cb"
@ -4952,15 +4904,6 @@ dom-serializer@^1.0.1:
domhandler "^4.2.0"
entities "^2.0.0"
dom-serializer@^2.0.0:
version "2.0.0"
resolved "https://registry.yarnpkg.com/dom-serializer/-/dom-serializer-2.0.0.tgz#e41b802e1eedf9f6cae183ce5e622d789d7d8e53"
integrity sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==
dependencies:
domelementtype "^2.3.0"
domhandler "^5.0.2"
entities "^4.2.0"
domelementtype@1, domelementtype@^1.3.1:
version "1.3.1"
resolved "https://registry.yarnpkg.com/domelementtype/-/domelementtype-1.3.1.tgz#d048c44b37b0d10a7f2a3d5fee3f4333d790481f"
@ -4971,11 +4914,6 @@ domelementtype@^2.0.1, domelementtype@^2.2.0:
resolved "https://registry.yarnpkg.com/domelementtype/-/domelementtype-2.2.0.tgz#9a0b6c2782ed6a1c7323d42267183df9bd8b1d57"
integrity sha512-DtBMo82pv1dFtUmHyr48beiuq792Sxohr+8Hm9zoxklYPfa6n0Z3Byjj2IV7bmr2IyqClnqEQhfgHJJ5QF0R5A==
domelementtype@^2.3.0:
version "2.3.0"
resolved "https://registry.yarnpkg.com/domelementtype/-/domelementtype-2.3.0.tgz#5c45e8e869952626331d7aab326d01daf65d589d"
integrity sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==
domexception@^1.0.1:
version "1.0.1"
resolved "https://registry.yarnpkg.com/domexception/-/domexception-1.0.1.tgz#937442644ca6a31261ef36e3ec677fe805582c90"
@ -5004,13 +4942,6 @@ domhandler@^4.0.0, domhandler@^4.2.0:
dependencies:
domelementtype "^2.2.0"
domhandler@^5.0.1, domhandler@^5.0.2, domhandler@^5.0.3:
version "5.0.3"
resolved "https://registry.yarnpkg.com/domhandler/-/domhandler-5.0.3.tgz#cc385f7f751f1d1fc650c21374804254538c7d31"
integrity sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==
dependencies:
domelementtype "^2.3.0"
domutils@^1.5.1:
version "1.7.0"
resolved "https://registry.yarnpkg.com/domutils/-/domutils-1.7.0.tgz#56ea341e834e06e6748af7a1cb25da67ea9f8c2a"
@ -5028,15 +4959,6 @@ domutils@^2.5.2, domutils@^2.6.0:
domelementtype "^2.2.0"
domhandler "^4.2.0"
domutils@^3.0.1:
version "3.0.1"
resolved "https://registry.yarnpkg.com/domutils/-/domutils-3.0.1.tgz#696b3875238338cb186b6c0612bd4901c89a4f1c"
integrity sha512-z08c1l761iKhDFtfXO04C7kTdPBLi41zwOZl00WS8b5eiaebNpY00HKbztwBq+e3vyqWNwWF3mP9YLUeqIrF+Q==
dependencies:
dom-serializer "^2.0.0"
domelementtype "^2.3.0"
domhandler "^5.0.1"
dot-case@^3.0.4:
version "3.0.4"
resolved "https://registry.yarnpkg.com/dot-case/-/dot-case-3.0.4.tgz#9b2b670d00a431667a8a75ba29cd1b98809ce751"
@ -5102,15 +5024,6 @@ emoji-datasource@5.0.0:
resolved "https://registry.yarnpkg.com/emoji-datasource/-/emoji-datasource-5.0.0.tgz#1522fdba3c52223a1cf5a1c1fc282935400eaa06"
integrity sha512-LuvLWFnxznTH++GytEzpzOPUo1SB+6CUFqIlVETJJ3x9fpyMCKFfyqberbhMLOpT1qcNe+km+zoyBeUSC3u5Rw==
"emoji-mart-old@npm:emoji-mart-lazyload":
version "3.0.1-j"
resolved "https://registry.yarnpkg.com/emoji-mart-lazyload/-/emoji-mart-lazyload-3.0.1-j.tgz#87a90d30b79d9145ece078d53e3e683c1a10ce9c"
integrity sha512-0wKF7MR0/iAeCIoiBLY+JjXCugycTgYRC2SL0y9/bjNSQlbeMdzILmPQJAufU/mgLFDUitOvjxLDhOZ9yxZ48g==
dependencies:
"@babel/runtime" "^7.0.0"
intersection-observer "^0.12.0"
prop-types "^15.6.0"
emoji-mart@^5.1.0:
version "5.1.0"
resolved "https://registry.yarnpkg.com/emoji-mart/-/emoji-mart-5.1.0.tgz#8a36a872e1297747342d1385bd7b7141ac2f4365"
@ -5181,7 +5094,7 @@ entities@^2.0.0:
resolved "https://registry.yarnpkg.com/entities/-/entities-2.2.0.tgz#098dc90ebb83d8dffa089d55256b351d34c4da55"
integrity sha512-p92if5Nz619I0w+akJrLZH0MX0Pb5DX39XOwQTtXSdQQOaYH03S1uIQp4mhOZtAXrxq4ViO67YTiLBo2638o9A==
entities@^4.2.0, entities@^4.3.0, entities@^4.3.1:
entities@^4.3.1:
version "4.3.1"
resolved "https://registry.yarnpkg.com/entities/-/entities-4.3.1.tgz#c34062a94c865c322f9d67b4384e4169bcede6a4"
integrity sha512-o4q/dYJlmyjP2zfnaWDUC6A3BQFmVTX+tZPezK7k0GLSU9QYCauscf5Y+qcEPzKL+EixVouYDgLQK5H9GrLpkg==
@ -6463,16 +6376,6 @@ htmlparser2@^6.1.0:
domutils "^2.5.2"
entities "^2.0.0"
htmlparser2@^8.0.1:
version "8.0.1"
resolved "https://registry.yarnpkg.com/htmlparser2/-/htmlparser2-8.0.1.tgz#abaa985474fcefe269bc761a779b544d7196d010"
integrity sha512-4lVbmc1diZC7GUJQtRQ5yBAeUCL1exyMwmForWkRLnwyzWBFxN633SALPMGYaWZvKe9j1pRZJpauvmxENSp/EA==
dependencies:
domelementtype "^2.3.0"
domhandler "^5.0.2"
domutils "^3.0.1"
entities "^4.3.0"
http-deceiver@^1.2.7:
version "1.2.7"
resolved "https://registry.yarnpkg.com/http-deceiver/-/http-deceiver-1.2.7.tgz#fa7168944ab9a519d337cb0bec7284dc3e723d87"
@ -8705,13 +8608,6 @@ nth-check@^2.0.0:
dependencies:
boolbase "^1.0.0"
nth-check@^2.0.1:
version "2.1.1"
resolved "https://registry.yarnpkg.com/nth-check/-/nth-check-2.1.1.tgz#c9eab428effce36cd6b92c924bdb000ef1f1ed1d"
integrity sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==
dependencies:
boolbase "^1.0.0"
num2fraction@^1.2.2:
version "1.2.2"
resolved "https://registry.yarnpkg.com/num2fraction/-/num2fraction-1.2.2.tgz#6f682b6a027a4e9ddfa4564cd2589d1d4e669ede"
@ -9016,26 +8912,11 @@ parse-passwd@^1.0.0:
resolved "https://registry.yarnpkg.com/parse-passwd/-/parse-passwd-1.0.0.tgz#6d5b934a456993b23d37f40a382d6f1666a8e5c6"
integrity sha512-1Y1A//QUXEZK7YKz+rD9WydcE1+EuPr6ZBgKecAB8tmoW6UFv0NREVJe1p+jRxtThkcbbKkfwIbWJe/IeE6m2Q==
parse5-htmlparser2-tree-adapter@^7.0.0:
version "7.0.0"
resolved "https://registry.yarnpkg.com/parse5-htmlparser2-tree-adapter/-/parse5-htmlparser2-tree-adapter-7.0.0.tgz#23c2cc233bcf09bb7beba8b8a69d46b08c62c2f1"
integrity sha512-B77tOZrqqfUfnVcOrUvfdLbz4pu4RopLD/4vmu3HUPswwTA8OH0EMW9BlWR2B0RCoiZRAHEUu7IxeP1Pd1UU+g==
dependencies:
domhandler "^5.0.2"
parse5 "^7.0.0"
parse5@6.0.1:
version "6.0.1"
resolved "https://registry.yarnpkg.com/parse5/-/parse5-6.0.1.tgz#e1a1c085c569b3dc08321184f19a39cc27f7c30b"
integrity sha512-Ofn/CTFzRGTTxwpNEs9PP93gXShHcTq255nzRYSKe8AkVpZY7e1fpmTfOyoIvjP5HG7Z2ZM7VS9PPhQGW2pOpw==
parse5@^7.0.0:
version "7.0.0"
resolved "https://registry.yarnpkg.com/parse5/-/parse5-7.0.0.tgz#51f74a5257f5fcc536389e8c2d0b3802e1bfa91a"
integrity sha512-y/t8IXSPWTuRZqXc0ajH/UwDj4mnqLEbSttNbThcFhGrZuOyoyvNBO85PBp2jQa55wY9d07PBNjsK8ZP3K5U6g==
dependencies:
entities "^4.3.0"
parseurl@~1.3.2, parseurl@~1.3.3:
version "1.3.3"
resolved "https://registry.yarnpkg.com/parseurl/-/parseurl-1.3.3.tgz#9da19e7bee8d12dff0513ed5b76957793bc2e8d4"