How to solve: 'lengths' argument should be a 1D CPU int64 tensor?
I'm working on an LSTM model for multi-class text classification, but during text preprocessing I ran into an error I had never seen before. I think it was introduced by an update to stanfordnlp. Below are my code and the error I received:
Modules:
# StanfordNLP
!pip install stanfordnlp
import stanfordnlp
stanfordnlp.download('es', confirm_if_exists = True, version = 'latest')
stNLP = stanfordnlp.Pipeline(processors='tokenize,mwt,pos,lemma', lang='es', treebank = 'es_ancora', use_gpu=True)
# SpaCy
!spacy download es_core_news_sm # sm md
import spacy
spNLP = spacy.load('es_core_news_sm') #sm md
activated = spacy.prefer_gpu()
spacy.require_gpu()
import pandas as pd
import numpy as np
Stop-word removal:
from nltk.corpus import stopwords  # requires nltk.download('stopwords') once

def get_stop_words():
    # Collect every stop word found in the dataframe, via SpaCy's token.is_stop
    spacy_stop_words = list(dict.fromkeys(
        [str(i) for i in spNLP(' '.join([elem for elem in new_df['descripcion']])) if i.is_stop == True]))
    stop_words = stopwords.words('spanish')  # base Spanish stop-word list
    stop_words.extend(spec_stopwords)        # spec_stopwords: project-specific list defined elsewhere
    stop_words.extend(spacy_stop_words)      # add the SpaCy-derived stop words
    return set(stop_words)

stop_words = get_stop_words()  # keep the set in a variable for reuse on the dataframe
# Applying stopwords on the dataframe
new_df['descripcion'] = new_df['descripcion'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
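As an aside, feeding the whole column through spNLP as one giant string is memory-hungry; a sketch of the same collection using spaCy's nlp.pipe (same spNLP, new_df and spec_stopwords as above) streams the rows instead:

def get_stop_words_streamed():
    # Same result as get_stop_words(), but processes the column row by row
    spacy_stop_words = set()
    for doc in spNLP.pipe(new_df['descripcion'], batch_size=64):
        spacy_stop_words.update(str(t) for t in doc if t.is_stop)
    stop_words = set(stopwords.words('spanish'))
    stop_words.update(spec_stopwords)   # project-specific list defined elsewhere
    stop_words.update(spacy_stop_words)
    return stop_words

Either version yields the same stop_words set; pipe just avoids building one huge document in memory.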
Lemmatization:
def stanford_lemma(text):
    doc = stNLP(text)
    return ' '.join([word.lemma for sent in doc.sentences for word in sent.words])

# Lemmatization of the dataframe
new_df['descripcion'] = new_df['descripcion'].apply(lambda x: stanford_lemma(x))

# Rebuild the stop-word set after lemmatization
stop_words = get_stop_words()

# Apply the refreshed stop words on the dataframe
new_df['descripcion'] = new_df['descripcion'].apply(lambda x: ' '.join(
    [word for word in x.split() if word not in stop_words]))
Traceback:
RuntimeError Traceback (most recent call last)
<ipython-input-18-60972fc225b2> in <module>()
----> 1 new_df['descripcion'] = new_df['descripcion'].apply(lambda x: stanford_lemma(x))
2
3 # Getting new stop_words after lemmatization (Lemmatizing: personalidades, personlidad = stopword)
4 get_stop_words()
5
9 frames
pandas/_libs/lib.pyx in pandas._libs.lib.map_infer()
/usr/local/lib/python3.6/dist-packages/torch/nn/utils/rnn.py in pack_padded_sequence(input, lengths, batch_first, enforce_sorted)
231
232 data, batch_sizes = \
--> 233 _VF._pack_padded_sequence(input, lengths, batch_first)
234 return PackedSequence(data, batch_sizes, sorted_indices, None)
235
RuntimeError: 'lengths' argument should be a 1D CPU int64 tensor
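For context, the message comes from PyTorch itself rather than from stanfordnlp: newer PyTorch releases require the lengths argument of pack_padded_sequence to be a 1D int64 tensor living on the CPU, and raise exactly this error when they receive a CUDA tensor instead. A minimal sketch reproducing it, assuming a CUDA-capable machine (nothing below comes from the question's code):

import torch
from torch.nn.utils.rnn import pack_padded_sequence

batch = torch.randn(2, 3, 4)               # (batch, seq, features)
lengths_gpu = torch.tensor([3, 2]).cuda()  # lengths accidentally on the GPU

try:
    pack_padded_sequence(batch, lengths_gpu, batch_first=True)
except RuntimeError as e:
    print(e)  # 'lengths' argument should be a 1D CPU int64 tensor ...

# Moving the lengths back to the CPU makes the same call succeed:
packed = pack_padded_sequence(batch, lengths_gpu.cpu(), batch_first=True)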
Update: using the new Stanza library, I run into the same problem. The issue persists even when I try the lemma usage example from its documentation:
!pip install stanza
import stanza
stanza.download('es', package='ancora', processors='tokenize,mwt,pos,lemma', verbose=True)
stNLP = stanza.Pipeline(processors='tokenize,mwt,pos,lemma',
lang='es',
use_gpu=True)
doc = stNLP('Barack Obama nació en Hawaii.')  # the pipeline above is named stNLP
print(*[f'word: {word.text+" "}\tlemma: {word.lemma}' for sent in doc.sentences for word in sent.words], sep='\n')
Request (@Crystina): the dataset: new_df

Any suggestion to improve the question will be considered.
Answer: using Stanza only works when spaCy is not loaded. This is the error that appears once spacy.prefer_gpu() has been called. So don't load both libraries in a single GPU process; use them separately.
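A minimal sketch of that separation, assuming the same new_df and models as above: keep spaCy strictly on the CPU (skip prefer_gpu()/require_gpu() entirely) while collecting stop words, release it, and only then create the GPU Stanza pipeline for lemmatization:

import spacy
import stanza

# Stage 1: spaCy on the CPU only -- no spacy.prefer_gpu()/require_gpu()
spNLP = spacy.load('es_core_news_sm')
spacy_stop_words = {str(t) for t in spNLP(' '.join(new_df['descripcion'])) if t.is_stop}
del spNLP  # drop spaCy before any GPU work starts

# Stage 2: Stanza alone gets the GPU for lemmatization
stNLP = stanza.Pipeline(processors='tokenize,mwt,pos,lemma', lang='es', use_gpu=True)
new_df['descripcion'] = new_df['descripcion'].apply(
    lambda x: ' '.join(w.lemma for s in stNLP(x).sentences for w in s.words))

If spaCy itself has to run on the GPU, run the two stages as separate scripts or processes so they never share one GPU context.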