从字符串中提取可点击的词并包含标点符号

Question

我有一个句子和句子中的一系列可点击单词。数组不包含标点符号。

这是一句话

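Into the trunk we put two poles and the can of worms and a sack of sandwiches and a thermos of water. “We’re going on a journey,” my father said. “To a secret place. We’ll catch the air! We’ll catch the breeze!”

（中文大意：我们在后备箱里放了两根钓竿、一罐虫子、一袋三明治和一壶水。“我们要去旅行了，”我父亲说，“去一个秘密的地方。我们会捕捉空气！我们会捕捉微风！”下面 tokens 中的位置索引均针对上面的英文原句。）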

这是可点击词的结构。它是一个数组，包含每个单词在句子中起始位置和结束位置的索引。该数组不包含句子中的标点符号。

标点符号不可点击。

"tokens": [
            {
              "position": [
                0,
                4
              ],
              "value": "into"
            },
            {
              "position": [
                5,
                8
              ],
              "value": "the"
            },
            {
              "position": [
                9,
                14
              ],
              "value": "trunk"
            },
            {
              "position": [
                15,
                17
              ],
              "value": "we"
            },
            {
              "position": [
                18,
                21
              ],
              "value": "put"
            },
            {
              "position": [
                22,
                25
              ],
              "value": "two"
            },
            {
              "position": [
                26,
                31
              ],
              "value": "poles"
            },
            {
              "position": [
                32,
                35
              ],
              "value": "and"
            },
            {
              "position": [
                36,
                39
              ],
              "value": "the"
            },
            {
              "position": [
                40,
                43
              ],
              "value": "can"
            },
            {
              "position": [
                44,
                46
              ],
              "value": "of"
            },
            {
              "position": [
                47,
                52
              ],
              "value": "worms"
            },
            {
              "position": [
                53,
                56
              ],
              "value": "and"
            },
            {
              "position": [
                57,
                58
              ],
              "value": "a"
            },
            {
              "position": [
                59,
                63
              ],
              "value": "sack"
            },
            {
              "position": [
                64,
                66
              ],
              "value": "of"
            },
            {
              "position": [
                67,
                77
              ],
              "value": "sandwiches"
            },
            {
              "position": [
                78,
                81
              ],
              "value": "and"
            },
            {
              "position": [
                82,
                83
              ],
              "value": "a"
            },
            {
              "position": [
                84,
                91
              ],
              "value": "thermos"
            },
            {
              "position": [
                92,
                94
              ],
              "value": "of"
            },
            {
              "position": [
                95,
                100
              ],
              "value": "water"
            },
            {
              "position": [
                103,
                108
              ],
              "value": "we're"
            },
            {
              "position": [
                109,
                114
              ],
              "value": "going"
            },
            {
              "position": [
                115,
                117
              ],
              "value": "on"
            },
            {
              "position": [
                118,
                119
              ],
              "value": "a"
            },
            {
              "position": [
                120,
                127
              ],
              "value": "journey"
            },
            {
              "position": [
                130,
                132
              ],
              "value": "my"
            },
            {
              "position": [
                133,
                139
              ],
              "value": "father"
            },
            {
              "position": [
                140,
                144
              ],
              "value": "said"
            },
            {
              "position": [
                147,
                149
              ],
              "value": "to"
            },
            {
              "position": [
                150,
                151
              ],
              "value": "a"
            },
            {
              "position": [
                152,
                158
              ],
              "value": "secret"
            },
            {
              "position": [
                159,
                164
              ],
              "value": "place"
            },
            {
              "position": [
                166,
                171
              ],
              "value": "we'll"
            },
            {
              "position": [
                172,
                177
              ],
              "value": "catch"
            },
            {
              "position": [
                178,
                181
              ],
              "value": "the"
            },
            {
              "position": [
                182,
                185
              ],
              "value": "air"
            },
            {
              "position": [
                187,
                192
              ],
              "value": "we'll"
            },
            {
              "position": [
                193,
                198
              ],
              "value": "catch"
            },
            {
              "position": [
                199,
                202
              ],
              "value": "the"
            },
            {
              "position": [
                203,
                209
              ],
              "value": "breeze"
            }
          ]
        },

这是我获取可点击字词的代码

 // Builds one display string per token: the token's own text followed by
 // whatever characters are read from the look-ahead window computed below.
 const getWordsFromTokens = tokens.reduce((words, token)=>{
   let start = token.position[0]; // index of the token's first character in the sentence
   let end = token.position[1]; // index just past the token's last character (e.g. [0, 4] for "into")

   // end + (end - start) === end + token length, i.e. the window after the
   // token is exactly as long as the token itself.
   let diffrenceBetweenLastPositionAndFirst = end+(end-start); 
   
    /* Intended to capture the punctuation / non-token characters that follow
        the word. NOTE(review): because the window length equals the word's
        length rather than the gap to the next token, it can also include
        characters that belong to the following token(s).
    */
   let punctuationMarks = content.substring(end, (diffrenceBetweenLastPositionAndFirst)); 
   
   console.log(punctuationMarks);

   words.push( content.substring(start, end)+punctuationMarks); // append the characters read after the word
   return words; // hand the accumulator to the next reduce iteration
  },[]);

这是我呈现文本的方式

return (
    <div>
      <p> {
        getWordsFromTokens.map((word, index)=>{
         return <a href={'/word/' + word} > {word}</a>
        })
      }
      </p>
    </div>
  )

这是我的问题，当我渲染文本时，它看起来与原始文本不完全一样。我可能做错了什么？

这是最终结果的样子

我们在 tr 后备箱里放了两个电线杆和一罐 w 条虫子和一袋 s 三明治和一个热水瓶和热水瓶水。 “我们要去旅行了，”我父亲说。说。 “去一个秘密的地方。我们'我们会赶上 ai 空气！ W 我们会抓住 br breeze！”

Answer 1

这样的解决方案怎么样？我使用光标来跟踪句子中的位置。

const tokens = [{
    "position": [
      0,
      4
    ],
    "value": "into"
  },
  {
    "position": [
      5,
      8
    ],
    "value": "the"
  },
  {
    "position": [
      9,
      14
    ],
    "value": "trunk"
  },
  {
    "position": [
      15,
      17
    ],
    "value": "we"
  },
  {
    "position": [
      18,
      21
    ],
    "value": "put"
  },
  {
    "position": [
      22,
      25
    ],
    "value": "two"
  },
  {
    "position": [
      26,
      31
    ],
    "value": "poles"
  },
  {
    "position": [
      32,
      35
    ],
    "value": "and"
  },
  {
    "position": [
      36,
      39
    ],
    "value": "the"
  },
  {
    "position": [
      40,
      43
    ],
    "value": "can"
  },
  {
    "position": [
      44,
      46
    ],
    "value": "of"
  },
  {
    "position": [
      47,
      52
    ],
    "value": "worms"
  },
  {
    "position": [
      53,
      56
    ],
    "value": "and"
  },
  {
    "position": [
      57,
      58
    ],
    "value": "a"
  },
  {
    "position": [
      59,
      63
    ],
    "value": "sack"
  },
  {
    "position": [
      64,
      66
    ],
    "value": "of"
  },
  {
    "position": [
      67,
      77
    ],
    "value": "sandwiches"
  },
  {
    "position": [
      78,
      81
    ],
    "value": "and"
  },
  {
    "position": [
      82,
      83
    ],
    "value": "a"
  },
  {
    "position": [
      84,
      91
    ],
    "value": "thermos"
  },
  {
    "position": [
      92,
      94
    ],
    "value": "of"
  },
  {
    "position": [
      95,
      100
    ],
    "value": "water"
  },
  {
    "position": [
      103,
      108
    ],
    "value": "we're"
  },
  {
    "position": [
      109,
      114
    ],
    "value": "going"
  },
  {
    "position": [
      115,
      117
    ],
    "value": "on"
  },
  {
    "position": [
      118,
      119
    ],
    "value": "a"
  },
  {
    "position": [
      120,
      127
    ],
    "value": "journey"
  },
  {
    "position": [
      130,
      132
    ],
    "value": "my"
  },
  {
    "position": [
      133,
      139
    ],
    "value": "father"
  },
  {
    "position": [
      140,
      144
    ],
    "value": "said"
  },
  {
    "position": [
      147,
      149
    ],
    "value": "to"
  },
  {
    "position": [
      150,
      151
    ],
    "value": "a"
  },
  {
    "position": [
      152,
      158
    ],
    "value": "secret"
  },
  {
    "position": [
      159,
      164
    ],
    "value": "place"
  },
  {
    "position": [
      166,
      171
    ],
    "value": "we'll"
  },
  {
    "position": [
      172,
      177
    ],
    "value": "catch"
  },
  {
    "position": [
      178,
      181
    ],
    "value": "the"
  },
  {
    "position": [
      182,
      185
    ],
    "value": "air"
  },
  {
    "position": [
      187,
      192
    ],
    "value": "we'll"
  },
  {
    "position": [
      193,
      198
    ],
    "value": "catch"
  },
  {
    "position": [
      199,
      202
    ],
    "value": "the"
  },
  {
    "position": [
      203,
      209
    ],
    "value": "breeze"
  }
];

const content = 'Into the trunk we put two poles and the can of worms and a sack of sandwiches and a thermos of water. “We’re going on a journey,” my father said. “To a secret place. We’ll catch the air! We’ll catch the breeze!"';

let cursorPosition = 0; // index in `content` just past the most recently consumed token

// Walk the tokens in order and split `content` into alternating segments:
// the text lying between the previous token and this one ('non-word'),
// followed by the token's own text ('word').
const getWordsFromTokens = [];

for (const { position: [from, to] } of tokens) {
  // Spaces / punctuation sitting between the cursor and the start of this token.
  const gap = content.substring(cursorPosition, from);
  // The word itself, sliced from the sentence so original casing is kept.
  const word = content.substring(from, to);

  getWordsFromTokens.push(
    { type: 'non-word', value: gap },
    { type: 'word', value: word }
  );

  // Advance the cursor so the next gap starts right after this token.
  cursorPosition = to;
}

// Render the segments into #new-sentence.
//
// DOM nodes are built in a DocumentFragment and attached once, instead of
// concatenating onto innerHTML inside the loop. The innerHTML approach
// re-parses the whole paragraph on every iteration, and it interpolates raw
// text into a single-quoted href, so a word containing an apostrophe (or any
// markup characters) would corrupt the generated HTML.
const sentenceElement = document.getElementById('new-sentence');
const sentenceFragment = document.createDocumentFragment();

for (const { type, value } of getWordsFromTokens) {
  if (type === 'word') {
    const link = document.createElement('a');
    // Encode the word so it is always a single, valid path segment.
    link.setAttribute('href', `/word/${encodeURIComponent(value)}`);
    link.textContent = value;
    sentenceFragment.append(link);
  } else {
    // Strings passed to append() become text nodes, so nothing is parsed as HTML.
    sentenceFragment.append(value);
  }
}

const endOfSentence = content.substring(cursorPosition); // any characters after the last token

sentenceFragment.append(endOfSentence);
sentenceElement.append(sentenceFragment);

<p id='new-sentence'></p>

Answer 2

我认为使用 RegExp 会让您的生活更轻松：

const content = `Into the trunk we put two poles and the can of worms and a sack of sandwiches and a thermos of water. "We're going on a journey," my father said. "To a secret place. We'll catch the air! We'll catch the breeze!`;

// Splits the sentence into words (letters, digits, underscore, apostrophe) and
// punctuation runs (punctuation plus any trailing whitespace / quotes).
// The hyphen inside the character class is escaped: unescaped, `:-_` is a
// RANGE from ':' to '_' that also covers `<=>@`, `[\]^` and every uppercase
// letter, so text such as "!World" was swallowed as punctuation.
const tokenPattern = /[\w']+|[.;,:\-_?!"]+[\s"]*/g;

// String.prototype.match returns null when nothing matches; fall back to an
// empty list so callers can always iterate.
const result = content.match(tokenPattern) || [];
console.log(result);

// Used to tell punctuation entries of `result` apart from word entries.
const punctuation = /[\.;,:\-_?!"]+/;

 function App() {
    return (
    <div>
      {result.map((w) =>
        punctuation.test(w) ? w : <a href={`/word/${w}`}>{w + '\n'}</a>
      )}
    </div>
  );
}

ReactDOM.render(<App/>, document.getElementById("root"))

<div id="root"></div>
<script src="https://cdnjs.cloudflare.com/ajax/libs/react/16.6.3/umd/react.production.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/react-dom/16.6.3/umd/react-dom.production.min.js"></script>

从字符串中提取可点击的词并包含标点符号

Extract Clickable words from String and Include Punctuation Marks

javascript

node.js

higher-order-functions

reactjs