实现思路
基础场景
- 最初的需求:遇到 aaa bbb ccc aaa 时,只高亮第一个 aaa,后面重复出现的不再高亮
- 使用正则
- 满足的是最后一个高亮
- 无法满足第一个aaa高亮
- 主要是正则无法记录状态
- 不使用正则,一次 scan + Set 去重,就可以简单实现
- 专业的做法是三步流水线:tokenize → annotate → render
- 分词(tokenize)
- 标注(annotate)
- 渲染(render)
- 基础实现
js
/**
 * Splits text at every word boundary.
 *
 * Because the separator pattern contains a capture group, `split` also
 * inserts the captured (always empty) boundary match between the pieces,
 * so the result contains empty strings next to each word/non-word piece.
 *
 * @param {string} text - Text to split.
 * @returns {string[]} Pieces of the text interleaved with empty strings.
 */
function tokenize(text) {
  const boundary = /(\b)/;
  const pieces = text.split(boundary);
  return pieces;
}
/**
 * Marks the first occurrence of each keyword in the text as a highlight.
 *
 * The text is split with `tokenize`; every resulting token becomes an
 * object of the form `{ type, value }`. A token is typed 'highlight' only
 * the first time it appears and is contained in `keywords`; every other
 * token (including later repeats of a keyword) is typed 'text'.
 *
 * @param {string} text - Text to annotate.
 * @param {Set<string>} keywords - Words to highlight (queried via `.has`).
 * @returns {{type: string, value: string}[]} Annotated tokens in text order.
 */
export const annotated = (text, keywords) => {
  const alreadyHighlighted = new Set();
  const annotatedTokens = [];

  for (const piece of tokenize(text)) {
    const isFirstHit = keywords.has(piece) && !alreadyHighlighted.has(piece);
    if (isFirstHit) {
      // Remember the keyword so later repeats fall through to plain text.
      alreadyHighlighted.add(piece);
    }
    annotatedTokens.push({
      type: isFirstHit ? 'highlight' : 'text',
      value: piece,
    });
  }

  return annotatedTokens;
};
// const keywords = new Set(['aaa', 'bbb']); // 要高亮的词
// console.log(annotated('aaa bbb ccc ddd',keywords))
输出结果:
{"type":"highlight","value":"aaa"}
{"type":"text","value":""}
{"type":"text","value":" "}
{"type":"text","value":""}
{"type":"highlight","value":"bbb"}
{"type":"text","value":""}
{"type":"text","value":" "}
{"type":"text","value":""}
{"type":"text","value":"ccc"}
{"type":"text","value":""}
{"type":"text","value":" "}
{"type":"text","value":""}
{"type":"text","value":"ddd"}
需求复杂
- 1、也考虑词汇(词组)高亮
js
/**
 * Splits text into consecutive typed tokens.
 *
 * Each token is a maximal run of word characters (`\w`), of whitespace,
 * or of anything else. Runs of word characters are typed 'word'; all
 * other runs are typed 'other'.
 *
 * @param {string} text - Text to tokenize.
 * @returns {{type: string, value: string}[]} Tokens in text order.
 */
function tokenize(text) {
  const pattern = /\w+|\s+|[^\w\s]+/g;
  const collected = [];

  for (let hit = pattern.exec(text); hit; hit = pattern.exec(text)) {
    const [value] = hit;
    const type = /\w+/.test(value) ? 'word' : 'other';
    collected.push({ type, value });
  }

  return collected;
}
/**
 * Annotates text, highlighting the first occurrence of each phrase.
 *
 * The text is split with `tokenize` into `{ type, value }` tokens. Starting
 * at every 'word' token, candidate phrases are tried from longest to
 * shortest; the first candidate that is a known phrase and has not been
 * highlighted before is merged into a single `{ type: 'highlight' }` token
 * whose value is the original text of the span. All other tokens are passed
 * through unchanged.
 *
 * @param {string} text - Text to annotate.
 * @param {string[][]} phrases - Phrases to highlight, each given as an array
 *   of words, e.g. `[['aaa', 'bbb', 'ccc'], ['ddd']]`.
 * @returns {{type: string, value: string}[]} Annotated tokens in text order.
 */
export function annotated(text, phrases) {
  const tokens = tokenize(text);
  const seen = new Set();
  const result = [];

  // Build the phrase lookup once instead of re-joining every phrase on
  // every probe inside the scan loop.
  const phraseKeys = new Set(phrases.map((p) => p.join(' ')));
  // Seeded with 0 so an empty phrase list simply yields no highlights.
  const maxLen = phrases.reduce((max, p) => Math.max(max, p.length), 0);

  for (let i = 0; i < tokens.length; ) {
    if (tokens[i].type !== 'word') {
      result.push(tokens[i]);
      i++;
      continue;
    }

    let matched = false;
    // Try the longest phrase first so it wins over any shorter prefix.
    for (let len = maxLen; len > 0; len--) {
      // Why len * 2 - 1? A phrase of `len` words separated by single
      // separator tokens spans [word, space, word, ...]: e.g. "aaa bbb"
      // is 2 words -> 3 tokens.
      const slice = tokens.slice(i, i + len * 2 - 1);
      const words = slice.filter((t) => t.type === 'word').map((t) => t.value);
      if (words.length !== len) continue;

      // Phrase words must be separated by whitespace only; punctuation such
      // as "aaa.bbb" must not be treated as the phrase "aaa bbb".
      const spansOnlyWhitespace = slice.every(
        (t) => t.type === 'word' || /^\s+$/.test(t.value),
      );
      if (!spansOnlyWhitespace) continue;

      const key = words.join(' ');
      if (phraseKeys.has(key) && !seen.has(key)) {
        // First occurrence of this phrase: emit it as one highlight token.
        seen.add(key);
        result.push({
          type: 'highlight',
          value: slice.map((t) => t.value).join(''),
        });
        i += slice.length;
        matched = true;
        break;
      }
    }

    if (!matched) {
      // No phrase starts here (or it was already highlighted).
      result.push(tokens[i]);
      i++;
    }
  }
  return result;
}
// Sample output (continues on the following lines):
// {"type":"highlight","value":"aaa bbb ccc"}
{"type":"other","value":" "}
{"type":"highlight","value":"ddd"}
- 2、词组与词组混合(混合交叉)
- 3、单词与词组混合(重叠)
- 升级到了 => 文本标注引擎