Python 脚本因大量数据而失败

Question

我写了一个 python 脚本，看起来运行良好。它列出了 AWS 中的 EC2 实例，然后将它们写入我们的 Confluence wiki。

如果它处理一个有 10 台服务器的环境，它会工作并写入 Confluence。如果它对一个拥有 100 台或更多服务器的帐户有效，它无法使用此堆栈跟踪写入 Confluence：

Traceback (most recent call last):
  File ".\aws_ec2_list_instances_orig.py", line 550, in <module>
    main()
  File ".\aws_ec2_list_instances_orig.py", line 543, in main
    write_data_to_confluence(auth, html, pageid, title)
  File ".\aws_ec2_list_instances_orig.py", line 391, in write_data_to_confluence
    r.raise_for_status()
  File "C:\Users\tdunphy\AppData\Roaming\Python\Python37\site-packages\requests\models.py", line 940, in raise_for_status
requests.exceptions.HTTPError: 400 Client Error:  for url: https://wiki.us.cworld.company.com/rest/api/content/138317098

我在这里也提出了一个详细的错误：

Traceback (most recent call last):
  File ".\aws_ec2_list_instances_orig.py", line 538, in <module>
    main()
  File ".\aws_ec2_list_instances_orig.py", line 531, in main
    write_data_to_confluence(auth, html, pageid, title)
  File ".\aws_ec2_list_instances_orig.py", line 380, in write_data_to_confluence
    raise RuntimeError(r.content)
RuntimeError: b'{"statusCode":400,"data":{"authorized":false,"valid":true,"allowedInReadOnlyMode":true,"errors":[],"successful":false},"message":"Error parsing xhtml: Unexpected character \'<\' (code 60); expected a semi-colon after the reference for entity \'C\'\n at [row,col {unknown-source}]: [1,46579]","reason":"Bad Request"}'

请注意，我不允许 POST 我的 POSTS 中的公司域。我将用 'company.com' 代替我真实的公司域名。

这是脚本：

#!/usr/bin/env python3

# Import modules
import boto3
import time
import objectpath
import csv
import os
import sys
import json
import requests
from requests_kerberos import HTTPKerberosAuth
import codecs
from datetime import datetime
from os.path import basename
from subprocess import check_output,CalledProcessError,PIPE

BASE_URL = "https://wiki.us.cworld.company.com/rest/api/content"
VIEW_URL = "https://wiki.us.cworld.company.com/pages/viewpage.action?pageId="

def banner(message, border='-'):
    line = border * len(message)
    print(line)
    print(message)
    print(line)

def initialize(interactive, aws_account):
    # Set the date
    today = datetime.today()
    today = today.strftime("%m-%d-%Y")
    # Set source files
    aws_env_list="../../source_files/aws_environments/aws_environments_all.txt"
    output_dir = "../../output_files/aws_instance_list/csv/"
    output_file = output_dir + 'aws-instance-master-list-' + aws_account + '-' + today +'.csv'
    output_file_name = 'aws-instance-master-list-' + aws_account + '-' + today +'.csv'
    return today, aws_env_list, output_file, output_file_name

def authenticate():
    #auth = get_login()
    auth = ('tdunphy', 'local4tl4nt1cNJ!')
    auth = str(auth).replace('(','').replace('\'','').replace(',',':').replace(')','').replace(' ','')
    kerberos_auth = HTTPKerberosAuth(mutual_authentication="DISABLED",principal=auth)
    auth = kerberos_auth
    return auth

## These are dummy AWS account numbers. I cannot post account number for my company.
def aws_accounts_to_account_numbers(aws_account):
    switcher = {
        'company-lab': '123456789101',
        'company-bill': '123456789102',
        'company-stage': '123456789103',
        'company-dlab': '123456789103',
    }
    return switcher.get(aws_account, "nothing")


def list_instances(aws_account,aws_account_number, interactive):
    today, aws_env_list, output_file, output_file_name = initialize(interactive, aws_account)
    engagement = None
    # Set the account
    session = boto3.Session(profile_name=aws_account)
    ec2 = session.client("ec2")
    fieldnames = [ 'AWS Account', 'Account Number', 'Name', 'Instance ID', 'VPC ID', 'Type', 'Platform', 'State', 'Key Name', 'Private IP', 'Public IP', 'Private DNS', 'Volumes', 'Availability Zone', 'Launch Date', 'Engagement Code']
    # Set the ec2 dictionary
    ec2info = {}
    public_ips_list = ''
    private_ips_list = ''
    private_dns = None
    with open(output_file, mode='w+') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames, delimiter=',', lineterminator='\n')
        writer.writeheader()
    if 'gov' in aws_account and not 'admin' in aws_account:
        print("This is a gov account.")
        session = boto3.Session(profile_name=aws_account,region_name='us-gov-west-1')
    else:
        print("This is a commercial account.")
        session = boto3.Session(profile_name=aws_account,region_name='us-east-1')

    ec2 = session.client("ec2")
    # Loop through the instances
    instance_list = ec2.describe_instances()
    for reservation in instance_list["Reservations"]:
            for instance in reservation.get("Instances", []):
                launch_time = instance["LaunchTime"]
                launch_time_friendly = launch_time.strftime("%B %d %Y")
                tree = objectpath.Tree(instance)
                block_devices = set(tree.execute('$..BlockDeviceMappings[\'Ebs\'][\'VolumeId\']'))
                if len(block_devices) == 0:
                    block_devices_list = None
                else:
                    block_devices_list = list(block_devices)
                    block_devices_list = str(block_devices_list).replace('[','').replace(']','').replace('\'','').replace('{','').replace('}', '')
                private_ips =  set(tree.execute('$..PrivateIpAddress'))
                if len(private_ips) == 0:
                    private_ips_list = None
                else:
                    private_ips_list = list(private_ips)
                    private_ips_list = str(private_ips_list).replace('[','').replace(']','').replace('\'','')
                public_ips =  set(tree.execute('$..PublicIp'))
                if len(public_ips) == 0:
                    public_ips_list = None
                else:
                    public_ips_list = list(public_ips)
                    public_ips_list = str(public_ips_list).replace('[','').replace(']','').replace('\'','')
                if 'KeyName' in instance:
                    key_name = instance['KeyName']
                else:
                    key_name = None
                name = None
                if 'Tags' in instance:
                    try:
                        tags = instance['Tags']
                        name = None
                        for tag in tags:
                            if tag["Key"] == "Name":
                                name = tag["Value"]
                        for tag in tags:
                            if tag["Key"] == "Engagement":
                                engagement = tag["Value"]
                            else:
                                engagement = None
                    except ValueError:
                        print("Instance: %s has no tags" % instance_id)
                if 'VpcId' in instance:
                    vpc_id = instance['VpcId']
                else:
                    vpc_id = None
                if 'PrivateDnsName' in instance:
                    private_dns = instance['PrivateDnsName']
                else:
                    private_dns = None
                if 'Platform' in instance:
                    platform = instance['Platform']
                else:
                    platform = None
                ec2info[instance['InstanceId']] = {
                    'AWS Account': aws_account,
                    'Account Number': aws_account_number,
                    'Name': name,
                    'Instance ID': instance['InstanceId'],
                    'VPC ID': vpc_id,
                    'Type': instance['InstanceType'],
                    'Platform': platform,
                    'State': instance['State']['Name'],
                    'Key Name': key_name,
                    'Private IP': private_ips_list,
                    'Public IP': public_ips_list,
                    'Private DNS': private_dns,
                    'Volumes': block_devices_list,
                    'Availability Zone': instance['Placement']['AvailabilityZone'],
                    'Launch Date': launch_time_friendly,
                    'Engagement Code': engagement
                }
                with open(output_file,'a') as csv_file:
                    writer = csv.DictWriter(csv_file, fieldnames=fieldnames, delimiter=',', lineterminator='\n')
                    writer.writerow({'AWS Account': aws_account, "Account Number": aws_account_number, 'Name': name, 'Instance ID': instance["InstanceId"], 'VPC ID': vpc_id, 'Type': instance["InstanceType"], 'Platform': platform, 'State': instance["State"]["Name"], 'Key Name': key_name,  'Private IP': private_ips_list, 'Public IP': public_ips_list, 'Private DNS': private_dns, 'Volumes': block_devices, 'Availability Zone': instance['Placement']['AvailabilityZone'], 'Launch Date': launch_time_friendly, 'Engagement Code': engagement})
    for instance_id, instance in ec2info.items():
        print(Fore.RESET + "-------------------------------------")
        for key in [
            'AWS Account',
            'Account Number',
            'Name',
            'Instance ID',
            'VPC ID',
            'Type',
            'Platform',
            'Key Name',
            'State',
            'Private IP',
            'Public IP',
            'Private DNS',
            'Volumes',
            'Availability Zone',
            'Launch Date',
            'Engagement Code'
        ]:
            print(Fore.GREEN + "{0}: {1}".format(key, instance.get(key)))   
        time.sleep(2)
    print(Fore.RESET + "-------------------------------------")
    with open(output_file,'a') as csv_file:
        csv_file.close()
    return output_file


def convert_csv_to_html_table(output_file, today, interactive, aws_account):
    output_dir = "../../output_files/aws_instance_list/html/"
    if interactive == 1:
        htmlfile = output_dir + 'aws-instance-master-list-' + aws_account + '-' + today +'.html'
        htmlfile_name = 'aws-instance-master-list-' + aws_account + '-' + today +'.html'
    else:
        htmlfile = output_dir + 'aws-instance-master-list-' + today +'.html'
        htmlfile_name = 'aws-instance-master-list-' + today +'.html'
    count = 0
    html = ''
    with open(output_file,'r') as CSVFILE:
        reader = csv.reader(CSVFILE)
        with open(output_file,'r') as CSVFILE:
            reader = csv.reader(CSVFILE)
            html += "<table><tbody>"
            for row in reader:
                html += "<tr>"
                # Process the headers
                if count == 0:
                    for column in row:
                        html += "<th>%s</th>" % column
                else:
                    # Process the data
                    for column in row:
                        html += "<td>%s</td>" % column
                html += "</tr>"
                count += 1
            html += "</tbody></table>"
    with open(htmlfile,'w+') as HTMLFILE:
        HTMLFILE.write(html)
    return htmlfile, htmlfile_name


def get_page_ancestors(auth, pageid):
    # Get basic page information plus the ancestors property
    url = '{base}/{pageid}?expand=ancestors'.format(
        base = BASE_URL,
        pageid = pageid)
    r = requests.get(url, auth = auth)
    r.raise_for_status()
    return r.json()['ancestors']


def get_page_info(auth, pageid):
    url = '{base}/{pageid}'.format(
        base = BASE_URL,
        pageid = pageid)
    r = requests.get(url, auth = auth)
    r.raise_for_status()
    return r.json()


def write_data_to_confluence(auth, html, pageid, title = None):
    info = get_page_info(auth, pageid)
    ver = int(info['version']['number']) + 1
    ancestors = get_page_ancestors(auth, pageid)
    anc = ancestors[-1]
    del anc['_links']
    del anc['_expandable']
    del anc['extensions']
    if title is not None:
        info['title'] = title
    data = {
        'id' : str(pageid),
        'type' : 'page',
        'title' : info['title'],
        'version' : {'number' : ver},
        'ancestors' : [anc],
        'body'  : {
            'storage' :
            {
                'representation' : 'storage',
                'value' : str(html)
            }
        }
    }
    data = json.dumps(data)
    url = '{base}/{pageid}'.format(base = BASE_URL, pageid = pageid)
    r = requests.put(
        url,
        data = data,
        auth = auth,
        headers = { 'Content-Type' : 'application/json' }
    )
    r.raise_for_status()
    print("Wrote '%s' version %d" % (info['title'], ver))
    print("URL: %s%d" % (VIEW_URL, pageid))

def main():
    pageid = 138317098
    title = 'AWS EC2 Instance List'
    aws_account = input("Enter the name of the AWS account you'll be working in: ")
    aws_account_number = aws_accounts_to_account_numbers(aws_account)
    today, aws_env_list, output_file, output_file_name = initialize(interactive, aws_account)
    output_file = list_instances(aws_account,aws_account_number, interactive)
    htmlfile, htmlfile_name = convert_csv_to_html_table(output_file, today, interactive, aws_account)
    with open(htmlfile, 'r', encoding='utf-8') as htmlfile:
        html = htmlfile.read()
    auth = authenticate()
    write_data_to_confluence(auth, html, pageid, title)


if __name__ == "__main__":
    main()

为什么这个脚本只有在处理很多服务器的时候才会写入confluence？

Answer 1

在无法查看您正在使用的数据的情况下进行诊断有点棘手。正如 fpbhb 所指出的，错误消息表明生成的 html 存在问题（可能是输入中某处的 &）。我会尝试转义 CSV 字段数据，然后再将其包装在 HTML 标签中：

from html import escape


def convert_csv_to_html_table(output_file, today, interactive, aws_account):
    # [...]

    count = 0
    html = ''
    with open(output_file,'r') as CSVFILE:
        reader = csv.reader(CSVFILE)
        html += "<table><tbody>"
        for row in reader:
            html += "<tr>"
            # Process the headers
            if count == 0:
                for column in row:
                    html += "<th>%s</th>" % escape(column)
            else:
                # Process the data
                for column in row:
                    html += "<td>%s</td>" % escape(column)
            html += "</tr>"
            count += 1
        html += "</tbody></table>"
    with open(htmlfile,'w+') as HTMLFILE:
        HTMLFILE.write(html)
    return htmlfile, htmlfile_name

Python 脚本因大量数据而失败

Python script fails with a lot of data

python

confluence

amazon-ec2