使用 python-docx 中的表格
Working with tables in python-docx
我有一个关于使用打开的 docx 文件的小问题。
这是我的代码的一部分:
# Open the document named by the first entry of self.fileName.
doc = Document(self.fileName[0])

# First pass: emit every paragraph, one per line.
for para in doc.paragraphs:
    self.cursor.insertText(para.text + '\n')

# Second pass: emit every table after all paragraphs, so the original
# interleaving of paragraphs and tables is not preserved here.
for table_number, table in enumerate(doc.tables, start=1):
    self.cursor.insertText('Таблица {0}\n'.format(table_number))
    row_count = len(table.rows)
    column_count = len(table.columns)
    for r in range(row_count):
        # Cells of one row are tab-separated (including a trailing tab).
        for c in range(column_count):
            cell_text = table.cell(r, c).text
            self.cursor.insertText(cell_text + '\t')
        self.cursor.insertText('\n')
    # Blank line after each table.
    self.cursor.insertText('\n')
问题是:我能否找出表格在原始文档中的实际位置?
我需要按照与文档中相同的顺序显示段落和表格。
python-docx 的 API 尚不直接支持此操作。但是,您可以在此处找到解决方法:
https://github.com/python-openxml/python-docx/issues/40
通过搜索 'python-docx iter block items' 可以找到更多信息。
根本原因在于,Microsoft 的 Word API 没有提供按文档顺序迭代块级项目的方法。Word 中的块级项目是段落和表格对象。python-docx 以 MS API 为起点建模,因此 Document.paragraphs 和 Document.tables 属性最先得到实现。Document.iter_block_items()(或者可能只是 Document.block_items)尚未实现,不过由于经常有人提出这一需求,它在功能增强列表中的位置比许多其他功能更靠前。
在此之前,您需要在自己的代码中实现变通方案。
感谢您提供 https://github.com/python-openxml/python-docx/issues/40 这个链接。不过,可能是由于约 5 年间的变化,必须对其中的代码做一些更新,我最终使用的是:
from docx.document import Document
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph
def iter_block_items(parent):
    """Yield each paragraph and table child of *parent*, in document order.

    Each yielded value is an instance of either ``Table`` or ``Paragraph``.
    *parent* would most commonly be the main ``Document`` object, but a
    ``_Cell`` object also works, since a cell can itself contain paragraphs
    and tables.

    Raises:
        ValueError: if *parent* is neither a ``Document`` nor a ``_Cell``.
            Because this is a generator, the error surfaces on the first
            iteration rather than at call time.
    """
    # Get the XML element whose children are the block-level items.
    if isinstance(parent, Document):
        parent_elm = parent.element.body
    elif isinstance(parent, _Cell):
        parent_elm = parent._tc
    else:
        raise ValueError("something's not right")

    # Walk the children in order; anything that is neither a paragraph nor a
    # table element is skipped.
    for child in parent_elm.iterchildren():
        if isinstance(child, CT_P):
            yield Paragraph(child, parent)
        elif isinstance(child, CT_Tbl):
            yield Table(child, parent)
下面的代码可以实现这一需求:
###Import all necessary packages
from docx.opc.constants import RELATIONSHIP_TYPE as RT
from docx import *
from docx.text.paragraph import Paragraph
from docx.text.paragraph import Run
import xml.etree.ElementTree as ET
from docx.document import Document as doctwo
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph
from docx.shared import Pt
from docxcompose.composer import Composer
from docx import Document as Document_compose
import pandas as pd
from xml.etree import ElementTree
from io import StringIO
import io
import csv
import base64
# Load the .docx file into the module-level `document` object.
# To process your own file, change the input path below.
document = Document('/Users/karthick/Desktop/iclouddrive/Work/QA/microchip datasheets/22100F-converted-latest.docx')
##This function extracts the tables and paragraphs from the document object
def iter_block_items(parent):
    """Generate the paragraphs and tables directly under *parent*, in order.

    Every yielded item is either a ``Paragraph`` or a ``Table``. *parent* is
    usually the main document object (``doctwo``), but a ``_Cell`` is accepted
    too, because a table cell can hold paragraphs and tables of its own.

    Raises ``ValueError`` (on first iteration) for any other kind of parent.
    """
    if isinstance(parent, doctwo):
        container = parent.element.body
    elif isinstance(parent, _Cell):
        container = parent._tc
    else:
        raise ValueError("something's not right")

    # XML element class -> proxy class used to wrap it; checked in this order.
    wrappers = ((CT_P, Paragraph), (CT_Tbl, Table))
    for element in container.iterchildren():
        for element_type, wrapper in wrappers:
            if isinstance(element, element_type):
                yield wrapper(element, parent)
                break
#This function extracts the table from the document object as a dataframe
def read_docx_tables(tab_id=None, **kwargs):
    """Parse table(s) of the module-level ``document`` into pandas DataFrame(s).

    Parameters:
        tab_id: index (counting from 0) into ``document.tables`` of the single
                table to parse. When ``None``, every table is parsed.
        kwargs: keyword arguments forwarded to ``pd.read_csv()``.

    Return: a single DataFrame if ``tab_id`` is not ``None``, otherwise a list
            of DataFrames (one per table).

    Raises:
        IndexError: if ``tab_id`` does not address an existing table; a
                    message is printed before the error is re-raised.
    """
    def table_to_frame(table):
        # Serialise the table to an in-memory CSV and let pandas parse it.
        buffer = io.StringIO()
        csv_writer = csv.writer(buffer)
        for table_row in table.rows:
            csv_writer.writerow(cell.text for cell in table_row.cells)
        buffer.seek(0)
        return pd.read_csv(buffer, **kwargs)

    if tab_id is not None:
        try:
            return table_to_frame(document.tables[tab_id])
        except IndexError:
            print('Error: specified [tab_id]: {} does not exist.'.format(tab_id))
            raise

    return [table_to_frame(table) for table in document.tables]
# Walk the document in order and build three results:
#   combined_df - one row per block (paragraph or table) in document order.
#                 Paragraph rows carry the paragraph text; table rows carry the
#                 table id (look the table up in table_list); a paragraph that
#                 contains a picture carries a 'Document_Imagefile/...'
#                 reference string instead of its text (look the picture up in
#                 image_df).
#   image_df    - one row per extracted picture, with its base64-encoded bytes.
#   table_list  - one DataFrame per table, in document order.
# Rows are collected in plain lists and turned into DataFrames once at the end,
# because DataFrame.append was removed in pandas 2.0.
combined_rows = []
table_rows = []
image_rows = []
# table_list[k] is the DataFrame parsed from the k-th table yielded below.
table_list = []
# Raw XML of every run that contains at least one picture.
xml_list = []
i = 0  # index of the next table, used as tab_id for read_docx_tables
imagecounter = 0  # running index of extracted pictures
blockxmlstring = ''  # not used in this block; name kept for compatibility

for block in iter_block_items(document):
    if isinstance(block, Paragraph):
        style = str(block.style.name)
        appendtxt = str(block.text).replace("\n", "").replace("\r", "")
        tabid = 'Novalue'
        for run in block.runs:
            xmlstr = str(run.element.xml)
            # Only runs whose XML mentions a picture need to be parsed.
            if 'pic:pic' not in xmlstr:
                continue
            xml_list.append(xmlstr)
            # Collect the prefix -> URI map declared in this run's XML so the
            # prefixed paths below can be resolved.
            my_namespaces = dict(
                node for _, node in ElementTree.iterparse(StringIO(xmlstr), events=['start-ns'])
            )
            root = ET.fromstring(xmlstr)
            for pic in root.findall('.//pic:pic', my_namespaces):
                cNvPr_elem = pic.find("pic:nvPicPr/pic:cNvPr", my_namespaces)
                blip_elem = pic.find("pic:blipFill/a:blip", my_namespaces)
                if cNvPr_elem is None or blip_elem is None:
                    continue
                name_attr = cNvPr_elem.get("name")
                embed_attr = blip_elem.get("{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed")
                if name_attr is None or embed_attr is None:
                    # No name or no embedded-image relationship id to follow
                    # (possibly a linked picture); nothing to extract.
                    continue
                # The paragraph row becomes a reference to the picture.
                appendtxt = str('Document_Imagefile/' + name_attr + '/' + embed_attr + '/' + str(imagecounter))
                image_part = document.part.related_parts[embed_attr]
                image_base64 = base64.b64encode(image_part.blob).decode()
                image_rows.append({
                    'image_index': imagecounter,
                    'image_rID': embed_attr,
                    'image_filename': name_attr,
                    'image_base64_string': image_base64,
                })
                style = 'Novalue'
                # NOTE(review): the original indentation of this increment was
                # lost; it is advanced per picture so every image_index is unique.
                imagecounter = imagecounter + 1
    elif isinstance(block, Table):
        style = 'Novalue'
        appendtxt = str(block)
        tabid = i
        dfs = read_docx_tables(tab_id=i)
        table_rows.append({'para_text': appendtxt, 'table_id': i, 'style': style})
        table_list.append(dfs)
        i = i + 1
    else:
        # iter_block_items only yields paragraphs and tables; ignore anything else.
        continue
    combined_rows.append({'para_text': appendtxt, 'table_id': tabid, 'style': style})

combined_df = pd.DataFrame(combined_rows, columns=['para_text', 'table_id', 'style'])
# 'string_value' is never filled in; the column is kept because it was part of
# the frame's original layout.
table_mod = pd.DataFrame(
    table_rows,
    columns=['string_value', 'table_id', 'para_text', 'style'] if table_rows else ['string_value', 'table_id'],
)
image_df = pd.DataFrame(
    image_rows,
    columns=['image_index', 'image_rID', 'image_filename', 'image_base64_string'],
)
关于这段代码工作原理的详细说明,可以参考这个链接:
https://github.com/kmrambo/Python-docx-Reading-paragraphs-tables-and-images-in-document-order-
我有一个关于使用打开的 docx 文件的小问题。 这是我的代码的一部分:
# Open the document named by the first entry of self.fileName.
doc = Document(self.fileName[0])

# First pass: emit every paragraph, one per line.
for para in doc.paragraphs:
    self.cursor.insertText(para.text + '\n')

# Second pass: emit every table after all paragraphs, so the original
# interleaving of paragraphs and tables is not preserved here.
for table_number, table in enumerate(doc.tables, start=1):
    self.cursor.insertText('Таблица {0}\n'.format(table_number))
    row_count = len(table.rows)
    column_count = len(table.columns)
    for r in range(row_count):
        # Cells of one row are tab-separated (including a trailing tab).
        for c in range(column_count):
            cell_text = table.cell(r, c).text
            self.cursor.insertText(cell_text + '\t')
        self.cursor.insertText('\n')
    # Blank line after each table.
    self.cursor.insertText('\n')
问题是:我能否找出表格在原始文档中的实际位置?我需要按照与文档中相同的顺序显示段落和表格。
python-docx 的 API 尚不直接支持此操作。但是,您可以在此处找到解决方法:
https://github.com/python-openxml/python-docx/issues/40
通过搜索 'python-docx iter block items' 可以找到更多信息。
根本原因在于,Microsoft 的 Word API 没有提供按文档顺序迭代块级项目的方法。Word 中的块级项目是段落和表格对象。python-docx 以 MS API 为起点建模,因此 Document.paragraphs 和 Document.tables 属性最先得到实现。Document.iter_block_items()(或者可能只是 Document.block_items)尚未实现,不过由于经常有人提出这一需求,它在功能增强列表中的位置比许多其他功能更靠前。
在此之前,您需要在自己的代码中实现变通方案。
感谢您提供 https://github.com/python-openxml/python-docx/issues/40 这个链接。不过,可能是由于约 5 年间的变化,必须对其中的代码做一些更新,我最终使用的是:
from docx.document import Document
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph
def iter_block_items(parent):
    """Yield each paragraph and table child of *parent*, in document order.

    Each yielded value is an instance of either ``Table`` or ``Paragraph``.
    *parent* would most commonly be the main ``Document`` object, but a
    ``_Cell`` object also works, since a cell can itself contain paragraphs
    and tables.

    Raises:
        ValueError: if *parent* is neither a ``Document`` nor a ``_Cell``.
            Because this is a generator, the error surfaces on the first
            iteration rather than at call time.
    """
    # Get the XML element whose children are the block-level items.
    if isinstance(parent, Document):
        parent_elm = parent.element.body
    elif isinstance(parent, _Cell):
        parent_elm = parent._tc
    else:
        raise ValueError("something's not right")

    # Walk the children in order; anything that is neither a paragraph nor a
    # table element is skipped.
    for child in parent_elm.iterchildren():
        if isinstance(child, CT_P):
            yield Paragraph(child, parent)
        elif isinstance(child, CT_Tbl):
            yield Table(child, parent)
下面的代码可以实现这一需求:
###Import all necessary packages
from docx.opc.constants import RELATIONSHIP_TYPE as RT
from docx import *
from docx.text.paragraph import Paragraph
from docx.text.paragraph import Run
import xml.etree.ElementTree as ET
from docx.document import Document as doctwo
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph
from docx.shared import Pt
from docxcompose.composer import Composer
from docx import Document as Document_compose
import pandas as pd
from xml.etree import ElementTree
from io import StringIO
import io
import csv
import base64
# Load the .docx file into the module-level `document` object.
# To process your own file, change the input path below.
document = Document('/Users/karthick/Desktop/iclouddrive/Work/QA/microchip datasheets/22100F-converted-latest.docx')
##This function extracts the tables and paragraphs from the document object
def iter_block_items(parent):
    """Generate the paragraphs and tables directly under *parent*, in order.

    Every yielded item is either a ``Paragraph`` or a ``Table``. *parent* is
    usually the main document object (``doctwo``), but a ``_Cell`` is accepted
    too, because a table cell can hold paragraphs and tables of its own.

    Raises ``ValueError`` (on first iteration) for any other kind of parent.
    """
    if isinstance(parent, doctwo):
        container = parent.element.body
    elif isinstance(parent, _Cell):
        container = parent._tc
    else:
        raise ValueError("something's not right")

    # XML element class -> proxy class used to wrap it; checked in this order.
    wrappers = ((CT_P, Paragraph), (CT_Tbl, Table))
    for element in container.iterchildren():
        for element_type, wrapper in wrappers:
            if isinstance(element, element_type):
                yield wrapper(element, parent)
                break
#This function extracts the table from the document object as a dataframe
def read_docx_tables(tab_id=None, **kwargs):
    """Parse table(s) of the module-level ``document`` into pandas DataFrame(s).

    Parameters:
        tab_id: index (counting from 0) into ``document.tables`` of the single
                table to parse. When ``None``, every table is parsed.
        kwargs: keyword arguments forwarded to ``pd.read_csv()``.

    Return: a single DataFrame if ``tab_id`` is not ``None``, otherwise a list
            of DataFrames (one per table).

    Raises:
        IndexError: if ``tab_id`` does not address an existing table; a
                    message is printed before the error is re-raised.
    """
    def table_to_frame(table):
        # Serialise the table to an in-memory CSV and let pandas parse it.
        buffer = io.StringIO()
        csv_writer = csv.writer(buffer)
        for table_row in table.rows:
            csv_writer.writerow(cell.text for cell in table_row.cells)
        buffer.seek(0)
        return pd.read_csv(buffer, **kwargs)

    if tab_id is not None:
        try:
            return table_to_frame(document.tables[tab_id])
        except IndexError:
            print('Error: specified [tab_id]: {} does not exist.'.format(tab_id))
            raise

    return [table_to_frame(table) for table in document.tables]
# Walk the document in order and build three results:
#   combined_df - one row per block (paragraph or table) in document order.
#                 Paragraph rows carry the paragraph text; table rows carry the
#                 table id (look the table up in table_list); a paragraph that
#                 contains a picture carries a 'Document_Imagefile/...'
#                 reference string instead of its text (look the picture up in
#                 image_df).
#   image_df    - one row per extracted picture, with its base64-encoded bytes.
#   table_list  - one DataFrame per table, in document order.
# Rows are collected in plain lists and turned into DataFrames once at the end,
# because DataFrame.append was removed in pandas 2.0.
combined_rows = []
table_rows = []
image_rows = []
# table_list[k] is the DataFrame parsed from the k-th table yielded below.
table_list = []
# Raw XML of every run that contains at least one picture.
xml_list = []
i = 0  # index of the next table, used as tab_id for read_docx_tables
imagecounter = 0  # running index of extracted pictures
blockxmlstring = ''  # not used in this block; name kept for compatibility

for block in iter_block_items(document):
    if isinstance(block, Paragraph):
        style = str(block.style.name)
        appendtxt = str(block.text).replace("\n", "").replace("\r", "")
        tabid = 'Novalue'
        for run in block.runs:
            xmlstr = str(run.element.xml)
            # Only runs whose XML mentions a picture need to be parsed.
            if 'pic:pic' not in xmlstr:
                continue
            xml_list.append(xmlstr)
            # Collect the prefix -> URI map declared in this run's XML so the
            # prefixed paths below can be resolved.
            my_namespaces = dict(
                node for _, node in ElementTree.iterparse(StringIO(xmlstr), events=['start-ns'])
            )
            root = ET.fromstring(xmlstr)
            for pic in root.findall('.//pic:pic', my_namespaces):
                cNvPr_elem = pic.find("pic:nvPicPr/pic:cNvPr", my_namespaces)
                blip_elem = pic.find("pic:blipFill/a:blip", my_namespaces)
                if cNvPr_elem is None or blip_elem is None:
                    continue
                name_attr = cNvPr_elem.get("name")
                embed_attr = blip_elem.get("{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed")
                if name_attr is None or embed_attr is None:
                    # No name or no embedded-image relationship id to follow
                    # (possibly a linked picture); nothing to extract.
                    continue
                # The paragraph row becomes a reference to the picture.
                appendtxt = str('Document_Imagefile/' + name_attr + '/' + embed_attr + '/' + str(imagecounter))
                image_part = document.part.related_parts[embed_attr]
                image_base64 = base64.b64encode(image_part.blob).decode()
                image_rows.append({
                    'image_index': imagecounter,
                    'image_rID': embed_attr,
                    'image_filename': name_attr,
                    'image_base64_string': image_base64,
                })
                style = 'Novalue'
                # NOTE(review): the original indentation of this increment was
                # lost; it is advanced per picture so every image_index is unique.
                imagecounter = imagecounter + 1
    elif isinstance(block, Table):
        style = 'Novalue'
        appendtxt = str(block)
        tabid = i
        dfs = read_docx_tables(tab_id=i)
        table_rows.append({'para_text': appendtxt, 'table_id': i, 'style': style})
        table_list.append(dfs)
        i = i + 1
    else:
        # iter_block_items only yields paragraphs and tables; ignore anything else.
        continue
    combined_rows.append({'para_text': appendtxt, 'table_id': tabid, 'style': style})

combined_df = pd.DataFrame(combined_rows, columns=['para_text', 'table_id', 'style'])
# 'string_value' is never filled in; the column is kept because it was part of
# the frame's original layout.
table_mod = pd.DataFrame(
    table_rows,
    columns=['string_value', 'table_id', 'para_text', 'style'] if table_rows else ['string_value', 'table_id'],
)
image_df = pd.DataFrame(
    image_rows,
    columns=['image_index', 'image_rID', 'image_filename', 'image_base64_string'],
)
关于这段代码工作原理的详细说明,可以参考这个链接:
https://github.com/kmrambo/Python-docx-Reading-paragraphs-tables-and-images-in-document-order-