从 df 列中提取文本行时出现“list index out of range”(列表索引超出范围)错误
list index out of range to extract text lines from a df column
我需要您的指导和支持,因为我在下面这段代码中找不到自己的错误。
据我所知,当计数器没有按 df 的长度正确初始化时,会引发“list index out of range”(列表索引超出范围)错误;但我想做的只是取出 Descripción 列的第一行,
把它作为示例文档 (doc) 来进行 NLTK 停用词分析。
# Notebook setup: data handling, plotting and NLP/ML imports, followed by
# loading the dataset and preparing the globals used by limpia_texto.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# IPython line magic; get_ipython() is only available inside an IPython/Jupyter session.
get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
# Do not truncate the column list when displaying DataFrames.
pd.set_option('display.max_columns', None)
import nltk
from nltk.corpus import stopwords
# Spanish stop-word list from the NLTK corpus reader.
stop_words = stopwords.words('spanish')
from nltk.stem import WordNetLemmatizer
import string
import base64
import re
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
# NOTE(review): this module path may not exist in recent scikit-learn releases - verify.
import sklearn.feature_extraction.stop_words
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix
import spacy
# NOTE(review): the loaded pipeline is not bound to a name, so the result is discarded.
spacy.load('es_core_news_sm')
from spacy.lang.es import Spanish
parser = Spanish()
# Load the dataset and preview the first rows.
df = pd.read_csv('geografia_empleos_MX.csv')
df.head(2)
# Remove the 'Unnamed: 0' column.
del df['Unnamed: 0']
df.head(1)
# Count missing values per column.
df.isnull().sum()
# Copy of df; the filtering further down is done on df1.
df1 = df.copy()
# Frequency of each value in the 'fraudulento' column.
df1['fraudulento'].value_counts()
import spacy
# Spanish spaCy pipeline called by limpia_texto.
nlp = spacy.load('es_core_news_lg')
# NOTE(review): rebinds `stopwords` from the NLTK corpus reader to a plain
# list of words; running this line a second time fails because a list has
# no .words attribute.
stopwords = stopwords.words('spanish')
# String of ASCII punctuation characters used for token filtering.
punctuations = string.punctuation
def limpia_texto(docs, logging = False):
    """Lemmatize, lowercase and filter each document, returning cleaned texts.

    Every document is run through the module-level spaCy pipeline ``nlp``
    with the parser and NER components disabled. Token lemmas are lowercased
    and stripped, then tokens found in the module-level ``stopwords``
    collection or in the ``punctuations`` string are dropped.

    Args:
        docs: Iterable of raw text documents. ``len(docs)`` is only needed
            when ``logging`` is true.
        logging: When true, print a progress line each time the running
            document count reaches a multiple of 100.

    Returns:
        pd.Series holding one space-joined string of kept tokens per
        document, in input order.
    """
    # Build the set once: membership in a list is a linear scan per token.
    stopword_set = set(stopwords)
    texts = []
    for counter, doc in enumerate(docs, start=1):
        if logging and counter % 100 == 0:
            print('Procesados: {} de {} documentos'.format(counter, len(docs)))
        parsed = nlp(doc, disable=['parser', 'ner'])
        # NOTE(review): '-PRON-' looks like the pronoun placeholder lemma of
        # spaCy 2.x - confirm it is still produced by the installed version.
        lemmas = [tok.lemma_.lower().strip() for tok in parsed if tok.lemma_ != '-PRON-']
        # `not in punctuations` is a substring test against a string, so it
        # also discards empty tokens ('' is a substring of every string).
        kept = [tok for tok in lemmas if tok not in stopword_set and tok not in punctuations]
        texts.append(' '.join(kept))
    return pd.Series(texts)
# Collect the 'Descripción' values of the rows where 'fraudulento' == 1 into a plain list.
Falso_1 = [text for text in df1[df1['fraudulento'] == 1]['Descripción']]
Falso_1[10] # IndexError is raised here: the list has no element at index 10
Falso_1 中不存在索引 10(它的元素不足 11 个),这就是引发错误的原因。下面这一行把数据框中筛选出的那一列收集成了一个列表:
# Builds a plain list from the 'Descripción' values of the rows where 'fraudulento' == 1.
Falso_1 = [text for text in df1[df1['fraudulento'] == 1]['Descripción']]
建议改用更符合 pandas 习惯的写法:
# Select the same values with a boolean mask through .loc and convert them to a NumPy array.
Falso_1 = df1.loc[df1['fraudulento'] == 1, 'Descripción'].to_numpy()
# .shape reports how many elements the array holds.
Falso_1 .shape
Falso_1.shape
上面的 .shape 会告诉你数组中的元素个数,也就是可用索引的数量。
我需要您的指导和支持,因为我在下面这段代码中找不到自己的错误。
据我所知,当计数器没有按 df 的长度正确初始化时,会引发“list index out of range”(列表索引超出范围)错误;但我想做的只是取出 Descripción 列的第一行,
把它作为示例文档 (doc) 来进行 NLTK 停用词分析。
# Notebook setup: data handling, plotting and NLP/ML imports, followed by
# loading the dataset and preparing the globals used by limpia_texto.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# IPython line magic; get_ipython() is only available inside an IPython/Jupyter session.
get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
# Do not truncate the column list when displaying DataFrames.
pd.set_option('display.max_columns', None)
import nltk
from nltk.corpus import stopwords
# Spanish stop-word list from the NLTK corpus reader.
stop_words = stopwords.words('spanish')
from nltk.stem import WordNetLemmatizer
import string
import base64
import re
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
# NOTE(review): this module path may not exist in recent scikit-learn releases - verify.
import sklearn.feature_extraction.stop_words
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix
import spacy
# NOTE(review): the loaded pipeline is not bound to a name, so the result is discarded.
spacy.load('es_core_news_sm')
from spacy.lang.es import Spanish
parser = Spanish()
# Load the dataset and preview the first rows.
df = pd.read_csv('geografia_empleos_MX.csv')
df.head(2)
# Remove the 'Unnamed: 0' column.
del df['Unnamed: 0']
df.head(1)
# Count missing values per column.
df.isnull().sum()
# Copy of df; the filtering further down is done on df1.
df1 = df.copy()
# Frequency of each value in the 'fraudulento' column.
df1['fraudulento'].value_counts()
import spacy
# Spanish spaCy pipeline called by limpia_texto.
nlp = spacy.load('es_core_news_lg')
# NOTE(review): rebinds `stopwords` from the NLTK corpus reader to a plain
# list of words; running this line a second time fails because a list has
# no .words attribute.
stopwords = stopwords.words('spanish')
# String of ASCII punctuation characters used for token filtering.
punctuations = string.punctuation
def limpia_texto(docs, logging = False):
    """Lemmatize, lowercase and filter each document, returning cleaned texts.

    Every document is run through the module-level spaCy pipeline ``nlp``
    with the parser and NER components disabled. Token lemmas are lowercased
    and stripped, then tokens found in the module-level ``stopwords``
    collection or in the ``punctuations`` string are dropped.

    Args:
        docs: Iterable of raw text documents. ``len(docs)`` is only needed
            when ``logging`` is true.
        logging: When true, print a progress line each time the running
            document count reaches a multiple of 100.

    Returns:
        pd.Series holding one space-joined string of kept tokens per
        document, in input order.
    """
    # Build the set once: membership in a list is a linear scan per token.
    stopword_set = set(stopwords)
    texts = []
    for counter, doc in enumerate(docs, start=1):
        if logging and counter % 100 == 0:
            print('Procesados: {} de {} documentos'.format(counter, len(docs)))
        parsed = nlp(doc, disable=['parser', 'ner'])
        # NOTE(review): '-PRON-' looks like the pronoun placeholder lemma of
        # spaCy 2.x - confirm it is still produced by the installed version.
        lemmas = [tok.lemma_.lower().strip() for tok in parsed if tok.lemma_ != '-PRON-']
        # `not in punctuations` is a substring test against a string, so it
        # also discards empty tokens ('' is a substring of every string).
        kept = [tok for tok in lemmas if tok not in stopword_set and tok not in punctuations]
        texts.append(' '.join(kept))
    return pd.Series(texts)
# Collect the 'Descripción' values of the rows where 'fraudulento' == 1 into a plain list.
Falso_1 = [text for text in df1[df1['fraudulento'] == 1]['Descripción']]
Falso_1[10] # IndexError is raised here: the list has no element at index 10
Falso_1 中不存在索引 10(它的元素不足 11 个),这就是引发错误的原因。下面这一行把数据框中筛选出的那一列收集成了一个列表:
# Builds a plain list from the 'Descripción' values of the rows where 'fraudulento' == 1.
Falso_1 = [text for text in df1[df1['fraudulento'] == 1]['Descripción']]
建议改用更符合 pandas 习惯的写法:
# Select the same values with a boolean mask through .loc and convert them to a NumPy array.
Falso_1 = df1.loc[df1['fraudulento'] == 1, 'Descripción'].to_numpy()
# .shape reports how many elements the array holds.
Falso_1 .shape
Falso_1.shape
上面的 .shape 会告诉你数组中的元素个数,也就是可用索引的数量。