Skip to content

实现思路

基础场景

  • 最初的需求就是 遇到 aaa bbb ccc aaa,第一个aaa高亮,后边的不再高亮
  • 使用正则
    • 实际得到的效果是最后一个 aaa 高亮
    • 无法满足第一个aaa高亮
    • 主要是正则无法记录状态
  • 不使用正则,一次 scan + Set 去重,就可以简单实现
  • 专业的做法是三步流水线:tokenize → annotate → render
    • 分词(tokenize)
    • 标注(annotate)
    • 渲染(render)
  • 基础实现
js
/**
 * Splits text into pieces at every regex word boundary (`\b`).
 *
 * Because the boundary is wrapped in a capturing group, `split` also emits
 * the captured (empty) match, so an empty-string token appears at each
 * boundary between two pieces.
 *
 * @param {string} text - Text to split.
 * @returns {string[]} Word runs, separator runs and empty boundary tokens, in order.
 */
function tokenize(text) {
  const boundary = /(\b)/;
  return text.split(boundary);
}

/**
 * Tokenizes text and marks the first occurrence of each keyword as a
 * highlight; every other token (including repeated keywords) stays text.
 *
 * @param {string} text - Text handed to tokenize().
 * @param {Set<string>} keywords - Words to highlight; only `has()` is used.
 * @returns {Array<{type: string, value: string}>} One entry per token, in order.
 */
export const annotated = (text, keywords) => {
  const alreadyHighlighted = new Set();
  const output = [];

  for (const token of tokenize(text)) {
    // A keyword is highlighted only the first time it is encountered.
    const isFirstHit = keywords.has(token) && !alreadyHighlighted.has(token);
    if (isFirstHit) {
      alreadyHighlighted.add(token);
    }
    output.push({ type: isFirstHit ? 'highlight' : 'text', value: token });
  }

  return output;
};

// const keywords = new Set(['aaa', 'bbb']); // 要高亮的词
// console.log(annotated('aaa bbb ccc ddd',keywords))
{"type":"highlight","value":"aaa"}
{"type":"text","value":""}
{"type":"text","value":" "}
{"type":"text","value":""}
{"type":"highlight","value":"bbb"}
{"type":"text","value":""}
{"type":"text","value":" "}
{"type":"text","value":""}
{"type":"text","value":"ccc"}
{"type":"text","value":""}
{"type":"text","value":" "}
{"type":"text","value":""}
{"type":"text","value":"ddd"}

需求复杂

  • 1、也考虑词汇(词组)高亮
js
/**
 * Breaks text into consecutive typed tokens.
 *
 * Each token is a maximal run of word characters (`\w`), of whitespace, or
 * of anything else. Runs of word characters are typed 'word'; all other
 * runs are typed 'other'.
 *
 * @param {string} text - Text to tokenize.
 * @returns {Array<{type: string, value: string}>} Tokens in source order.
 */
function tokenize(text) {
  const pattern = /\w+|\s+|[^\w\s]+/g;
  const pieces = [];

  // The global flag makes each exec() resume where the previous match ended.
  for (let hit = pattern.exec(text); hit; hit = pattern.exec(text)) {
    const [value] = hit;
    const type = /\w+/.test(value) ? 'word' : 'other';
    pieces.push({ type, value });
  }

  return pieces;
}

/**
 * Annotates text so that only the first occurrence of each phrase is
 * highlighted. Longer phrases are tried before shorter ones at every word.
 *
 * A phrase matches when its words appear as consecutive word tokens with
 * exactly one non-word token between each pair; the matched separator is
 * kept verbatim inside the highlight value.
 *
 * @param {string} text - Text handed to tokenize().
 * @param {string[][]} phrases - Phrases to highlight, each given as an array
 *   of words, e.g. [['aaa', 'bbb', 'ccc'], ['ddd']].
 * @returns {Array<{type: string, value: string}>} Tokens in source order;
 *   matched phrases are merged into a single 'highlight' entry, everything
 *   else keeps the type assigned by tokenize().
 */
export function annotated(text, phrases) {
  const tokens = tokenize(text);
  const seen = new Set();
  const result = [];

  // Build the phrase lookup once instead of re-joining every phrase on each
  // probe, and track the longest phrase without spreading the whole list
  // into Math.max (which can overflow the argument limit on large inputs).
  const phraseKeys = new Set();
  let maxLen = 0;
  for (const phrase of phrases) {
    phraseKeys.add(phrase.join(' '));
    if (phrase.length > maxLen) {
      maxLen = phrase.length;
    }
  }

  for (let i = 0; i < tokens.length; ) {
    // Non-word tokens can never start a phrase; pass them through untouched.
    if (tokens[i].type !== 'word') {
      result.push(tokens[i]);
      i++;
      continue;
    }

    let matched = false;

    // Try the longest candidate first so a long phrase wins over its prefix.
    for (let len = maxLen; len > 0; len--) {
      // A phrase of `len` words spans `len * 2 - 1` tokens, e.g. for len = 2
      // "aaa bbb" tokenizes as [word, space, word] -> 3 tokens.
      const slice = tokens.slice(i, i + len * 2 - 1);
      const words = slice.filter(t => t.type === 'word').map(t => t.value);

      // Too few words: the window ran past the end of the text or contains
      // more than one separator token between words.
      if (words.length !== len) continue;

      const key = words.join(' ');
      if (phraseKeys.has(key) && !seen.has(key)) {
        // First occurrence of this phrase: emit it as a single highlight.
        seen.add(key);
        result.push({
          type: 'highlight',
          value: slice.map(t => t.value).join('')
        });
        i += slice.length;
        matched = true;
        break;
      }
    }

    if (!matched) {
      // No phrase starts here (or it was already highlighted earlier).
      result.push(tokens[i]);
      i++;
    }
  }

  return result;
}
{"type":"highlight","value":"aaa bbb ccc"}
{"type":"other","value":" "}
{"type":"highlight","value":"ddd"}
  • 2、词组与词组混合(混合交叉)
  • 3、单词与词组混合(重叠)
  • 升级到了 => 文本标注引擎

词汇高亮