'utf-8' 编解码器无法解码字节 0xb7
'utf-8' codec can't decode byte 0xb7
我通过 python3 matchtagger.py bulcmatch 使用它。
我可以在哪里匹配特定的单词并捕获句子并将输出保存在 csv 中,
代码的同一文件夹中的所有注释,
import re
import click
import time
import os
import csv
import glob
from fuzzywuzzy import fuzz, process
timestr = time.strftime("%Y%m%d-%H%M%S")
@click.group()
@click.version_option(version='0.02', prog_name='matchtagger')
def main():
"""MatchTagger CLI :compares multiple files,match them and label them as tags"""
pass
@main.command()
@click.argument('current_path')
def bulkmatch(current_path):
""" Bulk Matching of File 1 with Keywords From 3 Files
eg. matchtagger bulkmatch .
eg. python matchtagger bulkmatch .
"""
# List of Default Keywords
keywords_file = {'pradaxa', 'gemcitabine', 'rivaroxaban', 'edoxa', 'xarelto', 'riva', 'edoxaban', 'eliquis',
'dabigatran', 'apix', 'savaysa', 'dabi', 'edox', 'apixaban', 'elliquis', 'coumadin', 'apixa', 'warfarin'}
pos_keywords = {'continue', 'start', 'begin', 'use', 'remain', 'will be on'}
neg_keywords = {'discontinue', 'stop', 'hold'}
results_file = "extracted_results" + timestr + '.csv'
# Find All Files
files = glob.glob('*.txt')
for f in files:
# Loop through each file and match them
with open(os.path.join(f)) as master_file:
with open(results_file, "a+") as finalfile:
writer = csv.writer(finalfile, delimiter='|')
print('line')
for line in master_file:
print('line')
line = re.sub(r'[^\x00-\x7F]+', ' ', line)
print(line)
if set(line.lower().split()[:-1]) & keywords_file:
click.echo('Found Match:: {}'.format(line))
matched_word = process.extract(
line, keywords_file, limit=1)
click.secho(
('Match Word:: {}'.format(matched_word)), fg="yellow")
if set(line.lower().split()[:-1]) & pos_keywords:
click.secho(
('Positive::{}'.format(line)), fg="blue")
label = "Positive"
elif set(line.lower().split()[:-1]) & neg_keywords:
click.secho(
('Negative::{}'.format(line)), fg="red")
label = "Negative"
else:
label = 'Neutral'
result = '"Filename":"{}"|\n"Matched_Sentence":"{}"|\n"Label":"{}"\n'.format(
master_file.name, line, label)
print(result)
writer.writerow((f, line.strip(), label))
click.secho(
('Finished Task For: {}'.format(line)), fg="blue")
click.secho(
('Saved Result in File: {}'.format(results_file)), fg="blue")
elif set(line.lower().split()[:-1]) not in keywords_file:
click.secho(
('None Found::{}'.format(line)), fg="green")
click.secho(('Finished Task For: {}'.format(f)), fg="blue")
click.secho(
('Saved Result in File: {}'.format(results_file)), fg="blue")
@main.command()
@click.argument('main_file')
@click.argument('keywords')
@click.argument('positive')
@click.argument('negative')
def matchfiles(main_file, keywords, positive, negative):
""" Match File 1 with Keywords From 3 Files
eg. matchtagger matchfiles masterfile.txt keywordfile.txt positivefile.txt negativefile.txt
eg. python matchtagger matchfiles masterfile.txt keywordfile.txt positivefile.txt negativefile.txt
"""
with open(os.path.join(keywords)) as second_file:
keywords_file = set(second_file.read().lower().split())
with open(os.path.join(positive)) as third_file:
pos_keywords = set(third_file.read().lower().split())
with open(os.path.join(negative)) as fourth_file:
neg_keywords = set(fourth_file.read().lower().split())
# File Name
results_file = "extracted_results" + timestr + '.csv'
# Open and Match Files
with open(os.path.join(main_file), errors='replace') as master_file:
with open(results_file, "a+") as finalfile:
writer = csv.writer(finalfile, delimiter='|')
for line in master_file:
if set(line.lower().split()[:-1]) & keywords_file:
click.echo('Found Match:: {}'.format(line))
matched_word = process.extract(
line, keywords_file, limit=1)
click.secho(
('Match Word:: {}'.format(matched_word)), fg="yellow")
if set(line.lower().split()[:-1]) & pos_keywords:
click.secho(('Positive::{}'.format(line)), fg="blue")
label = "Positive"
if set(line.lower().split()[:-1]) & neg_keywords:
click.secho(('Negative::{}'.format(line)), fg="red")
label = "Negative"
result = '"Filename":"{}"|\n"Matched_Sentence":"{}"|\n"Label":"{}"\n'.format(
master_file.name, line, label)
print(result)
writer.writerow(
(main_file, line.strip(), label))
click.secho(
('Finished Task For: {}'.format(line)), fg="blue")
click.secho(
('Saved Result in File: {}'.format(results_file)), fg="blue")
elif set(line.lower().split()[:-1]) not in keywords_file:
click.secho(('None Found::{}'.format(line)), fg="green")
else:
click.secho(('Neutral::{}'.format(line)), fg="green")
label = "Neutral"
click.secho(('Saved Result in File: {}'.format(results_file)), fg="blue")
if __name__ == '__main__':
main()
但是
输出
UnicodeDecodeError:'utf-8'编解码器无法解码位置 21 中的字节 0xb7:起始字节无效
是否与音符编码有关,如果是,我该如何在代码中修复它
您的文件可能与您使用的编码不同。
阅读此处了解 python link
中的不同编码
希望对您有所帮助。
我通过 python3 matchtagger.py bulcmatch 使用它。
我可以在哪里匹配特定的单词并捕获句子并将输出保存在 csv 中,
代码的同一文件夹中的所有注释,
import re
import click
import time
import os
import csv
import glob
from fuzzywuzzy import fuzz, process
timestr = time.strftime("%Y%m%d-%H%M%S")
@click.group()
@click.version_option(version='0.02', prog_name='matchtagger')
def main():
"""MatchTagger CLI :compares multiple files,match them and label them as tags"""
pass
@main.command()
@click.argument('current_path')
def bulkmatch(current_path):
""" Bulk Matching of File 1 with Keywords From 3 Files
eg. matchtagger bulkmatch .
eg. python matchtagger bulkmatch .
"""
# List of Default Keywords
keywords_file = {'pradaxa', 'gemcitabine', 'rivaroxaban', 'edoxa', 'xarelto', 'riva', 'edoxaban', 'eliquis',
'dabigatran', 'apix', 'savaysa', 'dabi', 'edox', 'apixaban', 'elliquis', 'coumadin', 'apixa', 'warfarin'}
pos_keywords = {'continue', 'start', 'begin', 'use', 'remain', 'will be on'}
neg_keywords = {'discontinue', 'stop', 'hold'}
results_file = "extracted_results" + timestr + '.csv'
# Find All Files
files = glob.glob('*.txt')
for f in files:
# Loop through each file and match them
with open(os.path.join(f)) as master_file:
with open(results_file, "a+") as finalfile:
writer = csv.writer(finalfile, delimiter='|')
print('line')
for line in master_file:
print('line')
line = re.sub(r'[^\x00-\x7F]+', ' ', line)
print(line)
if set(line.lower().split()[:-1]) & keywords_file:
click.echo('Found Match:: {}'.format(line))
matched_word = process.extract(
line, keywords_file, limit=1)
click.secho(
('Match Word:: {}'.format(matched_word)), fg="yellow")
if set(line.lower().split()[:-1]) & pos_keywords:
click.secho(
('Positive::{}'.format(line)), fg="blue")
label = "Positive"
elif set(line.lower().split()[:-1]) & neg_keywords:
click.secho(
('Negative::{}'.format(line)), fg="red")
label = "Negative"
else:
label = 'Neutral'
result = '"Filename":"{}"|\n"Matched_Sentence":"{}"|\n"Label":"{}"\n'.format(
master_file.name, line, label)
print(result)
writer.writerow((f, line.strip(), label))
click.secho(
('Finished Task For: {}'.format(line)), fg="blue")
click.secho(
('Saved Result in File: {}'.format(results_file)), fg="blue")
elif set(line.lower().split()[:-1]) not in keywords_file:
click.secho(
('None Found::{}'.format(line)), fg="green")
click.secho(('Finished Task For: {}'.format(f)), fg="blue")
click.secho(
('Saved Result in File: {}'.format(results_file)), fg="blue")
@main.command()
@click.argument('main_file')
@click.argument('keywords')
@click.argument('positive')
@click.argument('negative')
def matchfiles(main_file, keywords, positive, negative):
""" Match File 1 with Keywords From 3 Files
eg. matchtagger matchfiles masterfile.txt keywordfile.txt positivefile.txt negativefile.txt
eg. python matchtagger matchfiles masterfile.txt keywordfile.txt positivefile.txt negativefile.txt
"""
with open(os.path.join(keywords)) as second_file:
keywords_file = set(second_file.read().lower().split())
with open(os.path.join(positive)) as third_file:
pos_keywords = set(third_file.read().lower().split())
with open(os.path.join(negative)) as fourth_file:
neg_keywords = set(fourth_file.read().lower().split())
# File Name
results_file = "extracted_results" + timestr + '.csv'
# Open and Match Files
with open(os.path.join(main_file), errors='replace') as master_file:
with open(results_file, "a+") as finalfile:
writer = csv.writer(finalfile, delimiter='|')
for line in master_file:
if set(line.lower().split()[:-1]) & keywords_file:
click.echo('Found Match:: {}'.format(line))
matched_word = process.extract(
line, keywords_file, limit=1)
click.secho(
('Match Word:: {}'.format(matched_word)), fg="yellow")
if set(line.lower().split()[:-1]) & pos_keywords:
click.secho(('Positive::{}'.format(line)), fg="blue")
label = "Positive"
if set(line.lower().split()[:-1]) & neg_keywords:
click.secho(('Negative::{}'.format(line)), fg="red")
label = "Negative"
result = '"Filename":"{}"|\n"Matched_Sentence":"{}"|\n"Label":"{}"\n'.format(
master_file.name, line, label)
print(result)
writer.writerow(
(main_file, line.strip(), label))
click.secho(
('Finished Task For: {}'.format(line)), fg="blue")
click.secho(
('Saved Result in File: {}'.format(results_file)), fg="blue")
elif set(line.lower().split()[:-1]) not in keywords_file:
click.secho(('None Found::{}'.format(line)), fg="green")
else:
click.secho(('Neutral::{}'.format(line)), fg="green")
label = "Neutral"
click.secho(('Saved Result in File: {}'.format(results_file)), fg="blue")
if __name__ == '__main__':
main()
但是 输出 UnicodeDecodeError:'utf-8'编解码器无法解码位置 21 中的字节 0xb7:起始字节无效
是否与音符编码有关,如果是,我该如何在代码中修复它
您的文件可能与您使用的编码不同。 阅读此处了解 python link
中的不同编码希望对您有所帮助。