如何在目录中的所有文件类型中搜索正则表达式
How to search all file types in directory for regular expression
因此,我想在我的整个目录中搜索包含正则表达式列表的文件。这包括:目录、pdf 和 csv 文件。仅搜索文本文件时我可以成功完成此任务,但搜索所有文件类型却很困难。以下是我目前的工作:
import glob
import re
import PyPDF2
#-------------------------------------------------Input----------------------------------------------------------------------------------------------
# Asker's original script, kept verbatim (indentation was lost when the post
# was extracted). It scans every entry matched by the glob for e-mail
# addresses, phone numbers and "City, ST" locations.
folder_path = "/home/"
file_pattern = "/*"
# The two pieces concatenate to "/home//*".
folder_contents = glob.glob(folder_path + file_pattern)
#Search for Emails
regex1= re.compile(r'\S+@\S+')
#Search for Phone Numbers
regex2 = re.compile(r'\d\d\d[-]\d\d\d[-]\d\d\d\d')
#Search for Locations
# NOTE(review): this pattern is not a raw string although it contains `\w`.
regex3 =re.compile("([A-Z]\w+), ([A-Z]{2})")
for file in folder_contents:
# NOTE(review): every path is opened in text mode here; this is the line the
# traceback below reports the UnicodeDecodeError on. The handle is never closed.
read_file = open(file, 'rt').read()
# NOTE(review): `readile_file` and `pdf` are not defined anywhere in this script
# (the variable assigned above is `read_file`).
if readile_file == pdf:
# NOTE(review): opens the hard-coded name 'pdf.pdf' rather than `file`.
pdfFileObj = open('pdf.pdf', 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
# Only page 0 is read.
pageObj = pdfReader.getPage(0)
# NOTE(review): unbalanced ")" at the end of this line; `content` is also never
# used by the regex checks that follow (they test `read_file`).
content= pageObj.extractText())
if regex1.findall(read_file) or regex2.findall(read_file) or regex3.findall(read_file):
print ("YES, This file containts PHI")
print(file)
else:
print("No, This file DOES NOT contain PHI")
print(file)
当我运行这段代码时,我得到这个错误:
YES, This file containts PHI
/home/e136320/sample.txt
No, This file DOES NOT contain PHI
/home/e136320/medicalSample.txt
---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
<ipython-input-129-be0b68229c20> in <module>()
19
20 for file in folder_contents:
---> 21 read_file = open(file, 'rt').read()
22 if readile_file == pdf:
23 # creating a pdf file object
---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
<ipython-input-128-1537605cf636> in <module>()
18
19 for file in folder_contents:
---> 20 read_file = open(file, 'rt').read()
21 if regex1.findall(read_file) or regex2.findall(read_file) or regex3.findall(read_file):
22 print ("YES, This file containts PHI")
/jupyterhub_env/lib/python3.5/codecs.py in decode(self, input, final)
319 # decode input (taking the buffer into account)
320 data = self.buffer + input
--> 321 (result, consumed) = self._buffer_decode(data, self.errors, final)
322 # keep undecoded input until the next call
323 self.buffer = data[consumed:]
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc7 in position 10: invalid continuation byte
有什么建议吗?
您不能像这样打开 pdf 文件:以文本模式调用 open 时,它期望的是一个纯文本文件。你可以使用类似下面的写法:
import os


def read_pdf_text(path):
    """Return the text extracted from every page of the PDF at *path*."""
    # PyPDF2 needs a binary stream; PdfFileReader is not itself a context
    # manager, so the `with` wraps the underlying file object instead.
    with open(path, 'rb') as pdf_file:
        reader = PyPDF2.PdfFileReader(pdf_file)
        return "".join(
            reader.getPage(i).extractText()
            for i in range(reader.getNumPages())
        )


def read_plain_text(path):
    """Return the full contents of the plain-text file at *path*."""
    with open(path, 'rt') as text_file:
        return text_file.read()


# Choose the reader from the file extension (case-insensitively, so that
# "REPORT.PDF" is handled as a PDF too).
_, ext = os.path.splitext(file)
if ext.lower() == '.pdf':
    open_function = read_pdf_text
else:  # plain text
    open_function = read_plain_text
content = open_function(file)
# Do something with content...
此代码段检查文件扩展名,然后根据检查结果选择相应的打开函数。这种做法比较简单粗暴,可以参考这个回答(this answer)中展示的类似方法做得更好。
您的问题在于用同一种方式打开了不同类型的文件,必须先把它们区分开来:csv 可以直接按文本读取,pdf 则不行。我使用 re.search(r".*(?=pdf$)",file) 来判断文件类型,以防止 2.pdf.csv 这样的文件被当作 pdf 文件。
import glob
import re
import PyPDF2
#-------------------------------------------------Input----------------------------------------------------------------------------------------------
folder_path = "/home/e136320/"
file_pattern = "/*"
folder_contents = glob.glob(folder_path + file_pattern)
# Search for emails
regex1 = re.compile(r'\S+@\S+')
# Search for phone numbers
regex2 = re.compile(r'\d\d\d[-]\d\d\d[-]\d\d\d\d')
# Search for locations ("City, ST"); raw string so `\w` is not treated as a
# string escape.
regex3 = re.compile(r"([A-Z]\w+), ([A-Z]{2})")
for file in folder_contents:
    # The lookahead only accepts names whose very last characters are "pdf",
    # so "2.pdf.csv" is not mistaken for a PDF. Note that it does not require
    # a dot before the extension.
    if re.search(r".*(?=pdf$)", file):
        # PDFs are binary: open in 'rb' and let PyPDF2 extract the text.
        with open(file, 'rb') as pdfFileObj:
            pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
            # Scan every page, not just the first one.
            read_file = "".join(
                pdfReader.getPage(page_number).extractText()
                for page_number in range(pdfReader.getNumPages())
            )
    elif re.search(r".*(?=csv$)", file):
        # CSV is plain text; read-only mode is enough for scanning.
        with open(file, "r", encoding="utf-8") as csv_file:
            read_file = csv_file.read()
    else:
        # Directories and any other file types are skipped.
        continue
    # `search` stops at the first hit; only a yes/no answer is needed here.
    if regex1.search(read_file) or regex2.search(read_file) or regex3.search(read_file):
        print("YES, This file containts PHI")
        print(file)
    else:
        print("No, This file DOES NOT contain PHI")
        print(file)
因此,我想在我的整个目录中搜索包含正则表达式列表的文件。这包括:目录、pdf 和 csv 文件。仅搜索文本文件时我可以成功完成此任务,但搜索所有文件类型却很困难。以下是我目前的工作:
import glob
import re
import PyPDF2
#-------------------------------------------------Input----------------------------------------------------------------------------------------------
# Asker's original script, kept verbatim (indentation was lost when the post
# was extracted). It scans every entry matched by the glob for e-mail
# addresses, phone numbers and "City, ST" locations.
folder_path = "/home/"
file_pattern = "/*"
# The two pieces concatenate to "/home//*".
folder_contents = glob.glob(folder_path + file_pattern)
#Search for Emails
regex1= re.compile(r'\S+@\S+')
#Search for Phone Numbers
regex2 = re.compile(r'\d\d\d[-]\d\d\d[-]\d\d\d\d')
#Search for Locations
# NOTE(review): this pattern is not a raw string although it contains `\w`.
regex3 =re.compile("([A-Z]\w+), ([A-Z]{2})")
for file in folder_contents:
# NOTE(review): every path is opened in text mode here; this is the line the
# traceback below reports the UnicodeDecodeError on. The handle is never closed.
read_file = open(file, 'rt').read()
# NOTE(review): `readile_file` and `pdf` are not defined anywhere in this script
# (the variable assigned above is `read_file`).
if readile_file == pdf:
# NOTE(review): opens the hard-coded name 'pdf.pdf' rather than `file`.
pdfFileObj = open('pdf.pdf', 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
# Only page 0 is read.
pageObj = pdfReader.getPage(0)
# NOTE(review): unbalanced ")" at the end of this line; `content` is also never
# used by the regex checks that follow (they test `read_file`).
content= pageObj.extractText())
if regex1.findall(read_file) or regex2.findall(read_file) or regex3.findall(read_file):
print ("YES, This file containts PHI")
print(file)
else:
print("No, This file DOES NOT contain PHI")
print(file)
当我运行这段代码时,我得到这个错误:
YES, This file containts PHI
/home/e136320/sample.txt
No, This file DOES NOT contain PHI
/home/e136320/medicalSample.txt
---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
<ipython-input-129-be0b68229c20> in <module>()
19
20 for file in folder_contents:
---> 21 read_file = open(file, 'rt').read()
22 if readile_file == pdf:
23 # creating a pdf file object
---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
<ipython-input-128-1537605cf636> in <module>()
18
19 for file in folder_contents:
---> 20 read_file = open(file, 'rt').read()
21 if regex1.findall(read_file) or regex2.findall(read_file) or regex3.findall(read_file):
22 print ("YES, This file containts PHI")
/jupyterhub_env/lib/python3.5/codecs.py in decode(self, input, final)
319 # decode input (taking the buffer into account)
320 data = self.buffer + input
--> 321 (result, consumed) = self._buffer_decode(data, self.errors, final)
322 # keep undecoded input until the next call
323 self.buffer = data[consumed:]
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc7 in position 10: invalid continuation byte
有什么建议吗?
您不能像这样打开 pdf 文件:以文本模式调用 open 时,它期望的是一个纯文本文件。你可以使用类似下面的写法:
import os


def read_pdf_text(path):
    """Return the text extracted from every page of the PDF at *path*."""
    # PyPDF2 needs a binary stream; PdfFileReader is not itself a context
    # manager, so the `with` wraps the underlying file object instead.
    with open(path, 'rb') as pdf_file:
        reader = PyPDF2.PdfFileReader(pdf_file)
        return "".join(
            reader.getPage(i).extractText()
            for i in range(reader.getNumPages())
        )


def read_plain_text(path):
    """Return the full contents of the plain-text file at *path*."""
    with open(path, 'rt') as text_file:
        return text_file.read()


# Choose the reader from the file extension (case-insensitively, so that
# "REPORT.PDF" is handled as a PDF too).
_, ext = os.path.splitext(file)
if ext.lower() == '.pdf':
    open_function = read_pdf_text
else:  # plain text
    open_function = read_plain_text
content = open_function(file)
# Do something with content...
此代码段检查文件扩展名,然后根据检查结果选择相应的打开函数。这种做法比较简单粗暴,可以参考这个回答(this answer)中展示的类似方法做得更好。
您的问题在于用同一种方式打开了不同类型的文件,必须先把它们区分开来:csv 可以直接按文本读取,pdf 则不行。我使用 re.search(r".*(?=pdf$)",file) 来判断文件类型,以防止 2.pdf.csv 这样的文件被当作 pdf 文件。
import glob
import re
import PyPDF2
#-------------------------------------------------Input----------------------------------------------------------------------------------------------
folder_path = "/home/e136320/"
file_pattern = "/*"
folder_contents = glob.glob(folder_path + file_pattern)
# Search for emails
regex1 = re.compile(r'\S+@\S+')
# Search for phone numbers
regex2 = re.compile(r'\d\d\d[-]\d\d\d[-]\d\d\d\d')
# Search for locations ("City, ST"); raw string so `\w` is not treated as a
# string escape.
regex3 = re.compile(r"([A-Z]\w+), ([A-Z]{2})")
for file in folder_contents:
    # The lookahead only accepts names whose very last characters are "pdf",
    # so "2.pdf.csv" is not mistaken for a PDF. Note that it does not require
    # a dot before the extension.
    if re.search(r".*(?=pdf$)", file):
        # PDFs are binary: open in 'rb' and let PyPDF2 extract the text.
        with open(file, 'rb') as pdfFileObj:
            pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
            # Scan every page, not just the first one.
            read_file = "".join(
                pdfReader.getPage(page_number).extractText()
                for page_number in range(pdfReader.getNumPages())
            )
    elif re.search(r".*(?=csv$)", file):
        # CSV is plain text; read-only mode is enough for scanning.
        with open(file, "r", encoding="utf-8") as csv_file:
            read_file = csv_file.read()
    else:
        # Directories and any other file types are skipped.
        continue
    # `search` stops at the first hit; only a yes/no answer is needed here.
    if regex1.search(read_file) or regex2.search(read_file) or regex3.search(read_file):
        print("YES, This file containts PHI")
        print(file)
    else:
        print("No, This file DOES NOT contain PHI")
        print(file)