从人口普查下载数据块,如何写入多个 csvs 不超过内存
Downloading blocks of data from census, how to write to multiple csvs to not exceed memory
假设我有一个 api 键的列表,我正在从人口普查数据中下载
示例:
variable_list = [
'B08006_017E',
'B08016_002E',
'B08016_003E',
'B08016_004E',
...
]
现在考虑将此数据放入一个 csv 文件的内存限制。我想创建一种方法,将变量列表中的 100 个变量块放置到多个 csv 文件中。例如,如果我有 200 个变量,那么我将有前 100 个的 2 个 csv 文件和一个包含后 100 个变量的 csv 文件。我希望这是清楚的。
这是我目前下载数据的方式:
import pandas as pd
import censusdata
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.precision', 2)
#import statsmodels.formula.api as sm
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import censusgeocode as cg
import numpy as np
from numbers import Number
import plotly
import matplotlib.pyplot as plt
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import requests
import pandas
import geopandas
import json
import math
from haversine import haversine
from ipfn import ipfn
import networkx
from matplotlib import pyplot
from matplotlib import patheffects
from shapely.geometry import LineString, MultiLineString
# Census ACS variable codes to download.
# NOTE: every entry must end with a comma — Python silently concatenates
# adjacent string literals, which previously merged 'B08016_002E' and
# 'B08016_003E' into one invalid 23-character code.
variable_list1 = [
    'B08006_017E',
    'B08016_002E',
    'B08016_003E',
    'B08016_004E',
]
all_variable_lists = [variable_list1]
print(len(all_variable_lists[0]))
#2) For each year, download the relevant variables for each tract
def download_year(year, variable_list, State, County, Tract,
                  api_key='e39a53c23358c749629da6f31d8f03878d4088d6'):
    """Download one vintage of ACS 5-year data for the given geography.

    Parameters
    ----------
    year : int
        ACS vintage year to request.
    variable_list : list of str
        Census variable codes, e.g. 'B08006_017E'.
    State, County, Tract : str
        FIPS codes or '*' wildcards, passed to censusdata.censusgeo.
    api_key : str, optional
        Census API key. SECURITY NOTE(review): the default embeds a real
        key in source — move it to an environment variable and rotate it.

    Returns
    -------
    pandas.DataFrame
        The requested variables with a string 'Year' column appended.
    """
    df = censusdata.download(
        'acs5', year,
        censusdata.censusgeo([('state', State), ('county', County), ('tract', Tract)]),
        variable_list,
        key=api_key)
    df['Year'] = str(year)
    return df
#3) Define function to download for a single year and state
def callback_arg(i, variable_list, year):
    """Download one state's data (index ``i`` in 0-56) for ``year``.

    The Census API expects two-digit state FIPS codes, so single-digit
    indices are zero-padded. Returns the DataFrame on success, or None
    when the request fails (several indices in 0-56 are not assigned
    state FIPS codes).
    """
    try:
        print('Downloading - ', year, 'State', i, ' of 57')
        # zfill(2) replaces the original i<10 / i==51 / else branches,
        # which were identical apart from the zero padding.
        return download_year(year, variable_list, str(i).zfill(2), '*', '*')
    except Exception as exc:
        # Narrowed from a bare `except: pass`: log the failure so real
        # errors (bad key, network) are visible, then skip this state.
        print('Skipping state index', i, '-', exc)
        return None
#3) Function to download for all states and all years, do some slight formatting
def download_all_data(variable_list, max_year):
    """Download ``variable_list`` for all states, years 2012..max_year.

    Returns a string-typed DataFrame with 'Location+Type', 'Year',
    'state', 'Census_tract' and 'County_name' columns.

    Fixes versus the original:
    - no longer seeds the frame with an extra download of state '01'
      for 2012 (the loop re-fetched that same data, duplicating rows);
    - the ``year == 2012`` branch was identical to the else branch,
      so the loop body is unconditional;
    - frames are concatenated once with ``pd.concat`` instead of the
      deprecated, quadratic ``DataFrame.append`` in a loop;
    - 'Census_tract' is computed per row; the old trailing ``[0]``
      broadcast the first row's value to the entire column.
    """
    frames = []
    for year in range(2012, max_year + 1):
        for i in range(0, 57):
            part = callback_arg(i, variable_list, year)
            if part is not None:  # callback_arg returns None on failure
                frames.append(part)
    df2 = pd.concat(frames).reset_index()
    df2 = df2.rename(columns={"index": "Location+Type"}).astype(str)
    # The index string looks like "Census Tract T, County C, State: ...";
    # split once and derive each column from the shared prefix.
    loc = df2["Location+Type"].str.split(':').str[0]
    df2['state'] = loc.str.split(', ').str[2]
    df2['Census_tract'] = loc.str.split(',').str[0].str.split(' ').str[2]
    df2['County_name'] = loc.str.split(', ').str[1]
    return df2
#4) Some slight formatting
# NOTE(review): this definition is shadowed by the later
# `def write_to_csv(df, ide)` in the same script, so it is dead code —
# rename or remove one of the two definitions.
def write_to_csv(df2,name = 'test'):
    # Writes to `name` verbatim; no '.csv' extension is appended.
    df2.to_csv(name)
#5) The line below is commented out, but should run the entire download sequence
def write_to_csv(df, ide):
    """Write ``df`` to ``test<ide>.csv`` in the current directory."""
    out_name = 'test' + str(ide) + '.csv'
    df.to_csv(out_name)
# Run the full download pipeline once per chunk of variables, then
# write the first result to test1.csv.
list_of_dfs = [download_all_data(chunk, 2012) for chunk in all_variable_lists]
x1 = list_of_dfs[0].reset_index()
# x3 = pd.merge(x1,x2, on=['index','Location+Type','Year','state','Census_tract','County_name'])
write_to_csv(x1, 1)
如果有人能给我一些关于如何实现我想要的想法,这将对我有很大帮助。谢谢。
您似乎已经在此处分块 variable_lists:
for var_list in all_variable_lists:
list_of_dfs.append(download_all_data(var_list, 2012))
只需确保每个 var_list
只有 100 个项目。然后以相同的方式分块 csv 写入,使用 enumerate
增加文件名的索引:
for index, out_list in enumerate(list_of_dfs):
write_to_csv(out_list.reset_index(),index)
如果您只是想在写入时分解最终输出(注意:np.array_split(x1, 100) 会把 x1 分成 100 个大小相近的块;如果想要每块约 100 行,应改用 np.array_split(x1, math.ceil(len(x1)/100))):
for index, out_list in enumerate(np.array_split(x1, 100)):
write_to_csv(out_list,index)
假设我有一个 api 键的列表,我正在从人口普查数据中下载
示例:
variable_list = [
'B08006_017E',
'B08016_002E',
'B08016_003E',
'B08016_004E',
...
]
现在考虑将此数据放入一个 csv 文件的内存限制。我想创建一种方法,将变量列表中的 100 个变量块放置到多个 csv 文件中。例如,如果我有 200 个变量,那么我将有前 100 个的 2 个 csv 文件和一个包含后 100 个变量的 csv 文件。我希望这是清楚的。
这是我目前下载数据的方式:
import pandas as pd
import censusdata
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.precision', 2)
#import statsmodels.formula.api as sm
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import censusgeocode as cg
import numpy as np
from numbers import Number
import plotly
import matplotlib.pyplot as plt
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import requests
import pandas
import geopandas
import json
import math
from haversine import haversine
from ipfn import ipfn
import networkx
from matplotlib import pyplot
from matplotlib import patheffects
from shapely.geometry import LineString, MultiLineString
# Census ACS variable codes to download.
# NOTE: every entry must end with a comma — Python silently concatenates
# adjacent string literals, which previously merged 'B08016_002E' and
# 'B08016_003E' into one invalid 23-character code.
variable_list1 = [
    'B08006_017E',
    'B08016_002E',
    'B08016_003E',
    'B08016_004E',
]
all_variable_lists = [variable_list1]
print(len(all_variable_lists[0]))
#2) For each year, download the relevant variables for each tract
def download_year(year, variable_list, State, County, Tract,
                  api_key='e39a53c23358c749629da6f31d8f03878d4088d6'):
    """Download one vintage of ACS 5-year data for the given geography.

    Parameters
    ----------
    year : int
        ACS vintage year to request.
    variable_list : list of str
        Census variable codes, e.g. 'B08006_017E'.
    State, County, Tract : str
        FIPS codes or '*' wildcards, passed to censusdata.censusgeo.
    api_key : str, optional
        Census API key. SECURITY NOTE(review): the default embeds a real
        key in source — move it to an environment variable and rotate it.

    Returns
    -------
    pandas.DataFrame
        The requested variables with a string 'Year' column appended.
    """
    df = censusdata.download(
        'acs5', year,
        censusdata.censusgeo([('state', State), ('county', County), ('tract', Tract)]),
        variable_list,
        key=api_key)
    df['Year'] = str(year)
    return df
#3) Define function to download for a single year and state
def callback_arg(i, variable_list, year):
    """Download one state's data (index ``i`` in 0-56) for ``year``.

    The Census API expects two-digit state FIPS codes, so single-digit
    indices are zero-padded. Returns the DataFrame on success, or None
    when the request fails (several indices in 0-56 are not assigned
    state FIPS codes).
    """
    try:
        print('Downloading - ', year, 'State', i, ' of 57')
        # zfill(2) replaces the original i<10 / i==51 / else branches,
        # which were identical apart from the zero padding.
        return download_year(year, variable_list, str(i).zfill(2), '*', '*')
    except Exception as exc:
        # Narrowed from a bare `except: pass`: log the failure so real
        # errors (bad key, network) are visible, then skip this state.
        print('Skipping state index', i, '-', exc)
        return None
#3) Function to download for all states and all years, do some slight formatting
def download_all_data(variable_list, max_year):
    """Download ``variable_list`` for all states, years 2012..max_year.

    Returns a string-typed DataFrame with 'Location+Type', 'Year',
    'state', 'Census_tract' and 'County_name' columns.

    Fixes versus the original:
    - no longer seeds the frame with an extra download of state '01'
      for 2012 (the loop re-fetched that same data, duplicating rows);
    - the ``year == 2012`` branch was identical to the else branch,
      so the loop body is unconditional;
    - frames are concatenated once with ``pd.concat`` instead of the
      deprecated, quadratic ``DataFrame.append`` in a loop;
    - 'Census_tract' is computed per row; the old trailing ``[0]``
      broadcast the first row's value to the entire column.
    """
    frames = []
    for year in range(2012, max_year + 1):
        for i in range(0, 57):
            part = callback_arg(i, variable_list, year)
            if part is not None:  # callback_arg returns None on failure
                frames.append(part)
    df2 = pd.concat(frames).reset_index()
    df2 = df2.rename(columns={"index": "Location+Type"}).astype(str)
    # The index string looks like "Census Tract T, County C, State: ...";
    # split once and derive each column from the shared prefix.
    loc = df2["Location+Type"].str.split(':').str[0]
    df2['state'] = loc.str.split(', ').str[2]
    df2['Census_tract'] = loc.str.split(',').str[0].str.split(' ').str[2]
    df2['County_name'] = loc.str.split(', ').str[1]
    return df2
#4) Some slight formatting
# NOTE(review): this definition is shadowed by the later
# `def write_to_csv(df, ide)` in the same script, so it is dead code —
# rename or remove one of the two definitions.
def write_to_csv(df2,name = 'test'):
    # Writes to `name` verbatim; no '.csv' extension is appended.
    df2.to_csv(name)
#5) The line below is commented out, but should run the entire download sequence
def write_to_csv(df, ide):
    """Write ``df`` to ``test<ide>.csv`` in the current directory."""
    out_name = 'test' + str(ide) + '.csv'
    df.to_csv(out_name)
# Run the full download pipeline once per chunk of variables, then
# write the first result to test1.csv.
list_of_dfs = [download_all_data(chunk, 2012) for chunk in all_variable_lists]
x1 = list_of_dfs[0].reset_index()
# x3 = pd.merge(x1,x2, on=['index','Location+Type','Year','state','Census_tract','County_name'])
write_to_csv(x1, 1)
如果有人能给我一些关于如何实现我想要的想法,这将对我有很大帮助。谢谢。
您似乎已经在此处分块 variable_lists:
for var_list in all_variable_lists:
list_of_dfs.append(download_all_data(var_list, 2012))
只需确保每个 var_list
只有 100 个项目。然后以相同的方式分块 csv 写入,使用 enumerate
增加文件名的索引:
for index, out_list in enumerate(list_of_dfs):
write_to_csv(out_list.reset_index(),index)
如果您只是想在写入时分解最终输出(注意:np.array_split(x1, 100) 会把 x1 分成 100 个大小相近的块;如果想要每块约 100 行,应改用 np.array_split(x1, math.ceil(len(x1)/100))):
for index, out_list in enumerate(np.array_split(x1, 100)):
write_to_csv(out_list,index)