从 python-pptx 中的 XY 散点图中获取 x_values
Getting the x_values from XY Scatter plot in python-pptx
我正在尝试使用 python-pptx 提取 XY 散点图的数据,用于处理这样一种特殊情况:图表所引用的 Excel 工作表已不存在,而我现有的 VBA 代码无法读取数据。我可以获得所有 y_values,但一直没弄清楚如何获得 x_values。
从文档中我了解到,XY 散点图不像其他图表那样具有 "categories"。但是,我在 chart 或 chart.plots[0] 中找不到任何可以让我访问 x_values 的方法或对象。
我只看到 "categories",当然,对于 XY 散点图它是空的。
def get_chart_data():
    """Collect the y-values of every series of every chart in ``prs``.

    Reads the module-level ``prs`` presentation object. For each chart shape
    a fresh ``series_data`` list is built holding one list of values per
    series. The snippet as shown has no ``return`` statement, so the values
    are gathered but not handed back to the caller.
    """
    for sld in prs.slides:
        for shape in sld.shapes:
            if not shape.has_chart:
                continue
            chart = shape.chart
            # One entry per series of this chart.
            series_data = []
            for series in chart.series:
                # Per the surrounding text, ``series.values`` yields the
                # y-values only; x-values are not reachable this way.
                y_val = list(series.values)
                series_data.append(y_val)
目前还没有 API 支持,所以如果你确实非常需要它,就必须深入到 lxml 级别的调用才能获得它。
此 python-pptx
分析文档中显示了 XY 散点图的 XML 示例:https://python-pptx.readthedocs.io/en/latest/dev/analysis/cht-xy-chart.html#xml-specimen。这是部分片段:
<c:chart>
<c:scatterChart>
<c:ser>
...
<c:xVal>
<c:numRef>
<c:f>Sheet1!$A:$A</c:f>
<c:numCache>
<c:formatCode>General</c:formatCode>
<c:ptCount val="6"/>
<c:pt idx="0">
<c:v>0.7</c:v>
</c:pt>
<c:pt idx="1">
<c:v>1.8</c:v>
</c:pt>
<c:pt idx="2">
<c:v>2.6</c:v>
</c:pt>
</c:numCache>
</c:numRef>
</c:xVal>
<c:yVal>
<c:numRef>
<c:f>Sheet1!$B:$B</c:f>
<c:numCache>
<c:formatCode>General</c:formatCode>
<c:ptCount val="6"/>
<c:pt idx="0">
<c:v>2.7</c:v>
</c:pt>
<c:pt idx="1">
<c:v>3.2</c:v>
</c:pt>
<c:pt idx="2">
<c:v>0.8</c:v>
</c:pt>
</c:numCache>
</c:numRef>
</c:yVal>
</c:ser>
...
</c:scatterChart>
...
</c:chart>
点数据分别存放在 c:xVal 元素和 c:yVal 元素中。您可以使用 XPath 和 lxml.etree._Element 调用来获取它:
# Print the idx and numeric value of every cached <c:xVal> point of each
# series, reaching below the python-pptx API into the lxml element tree.
for series in chart.series:
    ser = series._ser  # the series' underlying lxml element
    x_pts = ser.xpath(".//c:xVal//c:pt")
    for pt in x_pts:
        # Format with ``%``; passing the value as a second print() argument
        # would emit the literal "%s" placeholder.
        print("pt.idx == %s" % pt.get("idx"))
        str_value = pt.xpath("./c:v")[0].text
        value = float(str_value)
        print("value == %s" % value)
您需要扩展这种通用方法来同时获取 Y 值,并按 idx 将它们配对成 (x, y)(不要依赖文档顺序),必要时丢弃不完整的点对;我依稀记得这种情况是可能出现的。
我想贴出我根据 @scanny 的回答写出的最终代码。
我的目标是遍历 PowerPoint 幻灯片,并从不再链接 Excel 工作表的 XY 散点图中提取数据。
from pptx import Presentation
from pptx.chart.series import XySeries
import numpy as np
def get_chart_data(prs):
    """Export every XY-scatter series found in *prs* to a CSV file.

    Walks all shapes on all slides. For each chart whose first series is an
    ``XySeries``, the x/y values of every series are read with ``read_xy``
    and written to ``<shape name>_<series name>.csv`` as two comma-separated
    columns (x, y), one row per point.

    Args:
        prs: Presentation object exposing ``slides``.

    Returns:
        None. The CSV files are the only output.
    """
    for sld in prs.slides:
        for shape in sld.shapes:
            if not shape.has_chart:
                continue
            series_list = list(shape.chart.series)
            # A chart without any series has nothing to export, and indexing
            # its first series would fail; non-XY charts are skipped too.
            if not series_list or not isinstance(series_list[0], XySeries):
                continue
            # Read every series first, keyed by series name, then write.
            chart_data = {}
            for series in series_list:
                x_values, y_values = read_xy(series)
                chart_data[series.name] = np.array([x_values, y_values])
            for series_name, xy in chart_data.items():
                # Transpose so each row is one (x, y) point.
                np.savetxt(f'{shape.name}_{series_name}.csv', xy.T, delimiter=',')
def read_xy(series):
    """Return the matched x- and y-values cached in an XY series.

    Points are paired by their ``idx`` rather than by document position, so
    a series whose x and y caches differ in length or order is still paired
    correctly. Points present on only one axis are dropped.

    Args:
        series: Series object whose ``_ser`` attribute supports ``xpath``.

    Returns:
        A ``(x_values, y_values)`` tuple of equal-length lists of floats,
        ordered by ascending point ``idx``.
    """
    ser = series._ser
    x_pts = ser.xpath(".//c:xVal//c:pt")  # all cached x points
    y_pts = ser.xpath(".//c:yVal//c:pt")  # all cached y points
    # Index each axis independently; looping over x and indexing y by
    # position would fail as soon as y holds fewer points than x.
    # NOTE(review): relies on each point element exposing ``idx``.
    x_by_idx = {pt.idx: get_pt_val(pt) for pt in x_pts}
    y_by_idx = {pt.idx: get_pt_val(pt) for pt in y_pts}
    # Keep only indices present on both axes; sort so the output order does
    # not depend on set iteration order.
    common = sorted(x_by_idx.keys() & y_by_idx.keys())
    xVal = [x_by_idx[i] for i in common]
    yVal = [y_by_idx[i] for i in common]
    return xVal, yVal
def get_pt_val(pt):
    """Return the value of a point element as a float.

    Args:
        pt: Element supporting ``xpath``; its first ``./c:v`` child's text
            is parsed.

    Returns:
        The parsed ``float``.

    Raises:
        IndexError: If the point has no ``c:v`` child.
        ValueError: If the text is not a valid float literal.
    """
    str_value = pt.xpath("./c:v")[0].text  # text of the first value child
    return float(str_value)
# Script entry point: open the presentation and export its XY chart data.
if __name__ == '__main__':
    prs = Presentation('Test.pptx')
    get_chart_data(prs)
我正在尝试使用 python-pptx 提取 XY 散点图的数据,用于处理这样一种特殊情况:图表所引用的 Excel 工作表已不存在,而我现有的 VBA 代码无法读取数据。我可以获得所有 y_values,但一直没弄清楚如何获得 x_values。
从文档中我了解到,XY 散点图不像其他图表那样具有 "categories"。但是,我在 chart 或 chart.plots[0] 中找不到任何可以让我访问 x_values 的方法或对象。
我只看到 "categories",当然,对于 XY 散点图它是空的。
def get_chart_data():
    """Collect the y-values of every series of every chart in ``prs``.

    Reads the module-level ``prs`` presentation object. For each chart shape
    a fresh ``series_data`` list is built holding one list of values per
    series. The snippet as shown has no ``return`` statement, so the values
    are gathered but not handed back to the caller.
    """
    for sld in prs.slides:
        for shape in sld.shapes:
            if not shape.has_chart:
                continue
            chart = shape.chart
            # One entry per series of this chart.
            series_data = []
            for series in chart.series:
                # Per the surrounding text, ``series.values`` yields the
                # y-values only; x-values are not reachable this way.
                y_val = list(series.values)
                series_data.append(y_val)
目前还没有 API 支持,所以如果你确实非常需要它,就必须深入到 lxml 级别的调用才能获得它。
此 python-pptx
分析文档中显示了 XY 散点图的 XML 示例:https://python-pptx.readthedocs.io/en/latest/dev/analysis/cht-xy-chart.html#xml-specimen。这是部分片段:
<c:chart>
<c:scatterChart>
<c:ser>
...
<c:xVal>
<c:numRef>
<c:f>Sheet1!$A:$A</c:f>
<c:numCache>
<c:formatCode>General</c:formatCode>
<c:ptCount val="6"/>
<c:pt idx="0">
<c:v>0.7</c:v>
</c:pt>
<c:pt idx="1">
<c:v>1.8</c:v>
</c:pt>
<c:pt idx="2">
<c:v>2.6</c:v>
</c:pt>
</c:numCache>
</c:numRef>
</c:xVal>
<c:yVal>
<c:numRef>
<c:f>Sheet1!$B:$B</c:f>
<c:numCache>
<c:formatCode>General</c:formatCode>
<c:ptCount val="6"/>
<c:pt idx="0">
<c:v>2.7</c:v>
</c:pt>
<c:pt idx="1">
<c:v>3.2</c:v>
</c:pt>
<c:pt idx="2">
<c:v>0.8</c:v>
</c:pt>
</c:numCache>
</c:numRef>
</c:yVal>
</c:ser>
...
</c:scatterChart>
...
</c:chart>
点数据分别存放在 c:xVal 元素和 c:yVal 元素中。您可以使用 XPath 和 lxml.etree._Element 调用来获取它:
# Print the idx and numeric value of every cached <c:xVal> point of each
# series, reaching below the python-pptx API into the lxml element tree.
for series in chart.series:
    ser = series._ser  # the series' underlying lxml element
    x_pts = ser.xpath(".//c:xVal//c:pt")
    for pt in x_pts:
        # Format with ``%``; passing the value as a second print() argument
        # would emit the literal "%s" placeholder.
        print("pt.idx == %s" % pt.get("idx"))
        str_value = pt.xpath("./c:v")[0].text
        value = float(str_value)
        print("value == %s" % value)
您需要扩展这种通用方法来同时获取 Y 值,并按 idx 将它们配对成 (x, y)(不要依赖文档顺序),必要时丢弃不完整的点对;我依稀记得这种情况是可能出现的。
我想贴出我根据 @scanny 的回答写出的最终代码。我的目标是遍历 PowerPoint 幻灯片,并从不再链接 Excel 工作表的 XY 散点图中提取数据。
from pptx import Presentation
from pptx.chart.series import XySeries
import numpy as np
def get_chart_data(prs):
    """Export every XY-scatter series found in *prs* to a CSV file.

    Walks all shapes on all slides. For each chart whose first series is an
    ``XySeries``, the x/y values of every series are read with ``read_xy``
    and written to ``<shape name>_<series name>.csv`` as two comma-separated
    columns (x, y), one row per point.

    Args:
        prs: Presentation object exposing ``slides``.

    Returns:
        None. The CSV files are the only output.
    """
    for sld in prs.slides:
        for shape in sld.shapes:
            if not shape.has_chart:
                continue
            series_list = list(shape.chart.series)
            # A chart without any series has nothing to export, and indexing
            # its first series would fail; non-XY charts are skipped too.
            if not series_list or not isinstance(series_list[0], XySeries):
                continue
            # Read every series first, keyed by series name, then write.
            chart_data = {}
            for series in series_list:
                x_values, y_values = read_xy(series)
                chart_data[series.name] = np.array([x_values, y_values])
            for series_name, xy in chart_data.items():
                # Transpose so each row is one (x, y) point.
                np.savetxt(f'{shape.name}_{series_name}.csv', xy.T, delimiter=',')
def read_xy(series):
    """Return the matched x- and y-values cached in an XY series.

    Points are paired by their ``idx`` rather than by document position, so
    a series whose x and y caches differ in length or order is still paired
    correctly. Points present on only one axis are dropped.

    Args:
        series: Series object whose ``_ser`` attribute supports ``xpath``.

    Returns:
        A ``(x_values, y_values)`` tuple of equal-length lists of floats,
        ordered by ascending point ``idx``.
    """
    ser = series._ser
    x_pts = ser.xpath(".//c:xVal//c:pt")  # all cached x points
    y_pts = ser.xpath(".//c:yVal//c:pt")  # all cached y points
    # Index each axis independently; looping over x and indexing y by
    # position would fail as soon as y holds fewer points than x.
    # NOTE(review): relies on each point element exposing ``idx``.
    x_by_idx = {pt.idx: get_pt_val(pt) for pt in x_pts}
    y_by_idx = {pt.idx: get_pt_val(pt) for pt in y_pts}
    # Keep only indices present on both axes; sort so the output order does
    # not depend on set iteration order.
    common = sorted(x_by_idx.keys() & y_by_idx.keys())
    xVal = [x_by_idx[i] for i in common]
    yVal = [y_by_idx[i] for i in common]
    return xVal, yVal
def get_pt_val(pt):
    """Return the value of a point element as a float.

    Args:
        pt: Element supporting ``xpath``; its first ``./c:v`` child's text
            is parsed.

    Returns:
        The parsed ``float``.

    Raises:
        IndexError: If the point has no ``c:v`` child.
        ValueError: If the text is not a valid float literal.
    """
    str_value = pt.xpath("./c:v")[0].text  # text of the first value child
    return float(str_value)
# Script entry point: open the presentation and export its XY chart data.
if __name__ == '__main__':
    prs = Presentation('Test.pptx')
    get_chart_data(prs)