解析 csv 文件,将其列更改为行,将行更改为列
Parsing a csv file, changing its columns into rows and rows into columns
我有一个 CSV 文件,其中第一行是日期,第一列是板块(Sector)。如下所示:
Date,7/2/2007,7/3/2007,7/5/2007,7/6/2007,7/9/2007
A,0,1,3,2,0
AA,23,423,2,0,0
AAL,34,23,5,0,234
AGCG,234,0,9,234,23
XL,0,65,34,34,34
所以现在我想准备另一个文件,就像
Date,Sector
7/2/2007,AA
7/2/2007,AAL
7/2/2007,AGCG
7/3/2007,A
7/3/2007,AA
7/3/2007,AAL
7/3/2007,XL
...
背后的逻辑是:对于每个日期,我只想列出数值不为 0 的板块(Sector)。
到目前为止我尝试过的代码是:
import csv,sys
from collections import defaultdict
# NOTE(review): the indentation of this snippet was lost when it was pasted, so
# the nesting implied by the comments below is inferred -- confirm against the
# original post. The snippet is Python 2 (print statement, dict.iteritems).

# Maps a date string to the list of sectors collected for that date.
dd = defaultdict(list)
# Dates taken from the header row, in column order (the first header cell is skipped).
dateList = []
# False until the header row has been consumed.
header = False
def createFile(di):
# Write a [sector, date] CSV row to cum_file.csv for the sectors stored under
# each date in di, and echo the same pairs to stdout.
# The file object is never closed explicitly in this snippet.
ff = open("cum_file.csv","w")
csvwriter = csv.writer(ff)
row = []
for d,t in di.iteritems():
for tt in t:
print tt,d
row = [tt,d]
csvwriter.writerow(row)
# Empty the row list in place.
del row[:]
#with open("./data/StrategyAcctValue-Daily.csv") as f:
with open("./try/test.csv") as f:
reader = csv.reader(f,delimiter=",")
for line in reader:
# col1 marks that the next cell read is the first cell of the row.
col1 = True
if header:
#sys.exit()
# Data row: the first cell is the sector name, the others are per-date values.
for eachCol in line:
if col1:
col1 = False
tkr = eachCol
elif eachCol != '0':
# list.index gives the position of the FIRST cell in this row whose text
# equals eachCol.
tkrIndex = line.index(eachCol)
# dateList has no entry for the first column, hence the -1 offset.
tickerDate = dateList[tkrIndex - 1]
dd[tickerDate].append(tkr)
else:
continue
#print dd
#createFile(dd)
#sys.exit()
else:
# Header row: remember every date and give it an empty sector list.
header = True
for eachCol in line:
# print line.index(eachCol)
# continue
if col1:
col1 = False
tkr = eachCol
else:
dd[eachCol] = []
dateList.append(eachCol)
print dateList
print dd
createFile(dd)
这给出了如下输出:
A 7/3/2007
AA 7/3/2007
AAL 7/3/2007
XL 7/3/2007
A 7/6/2007
AAL 7/9/2007
AGCG 7/9/2007
AA 7/2/2007
AAL 7/2/2007
AGCG 7/2/2007
AGCG 7/2/2007
A 7/5/2007
AA 7/5/2007
AAL 7/5/2007
AGCG 7/5/2007
XL 7/5/2007
XL 7/5/2007
XL 7/5/2007
我找不到哪里出错了。
import csv

# csv.reader copes with quoted fields and any line ending. Blank lines are
# dropped so that they cannot truncate the transpose below to a single column.
with open("test.csv", newline="") as f:
    rows = [row for row in csv.reader(f) if row]

# First column without its header cell: the sector names, in row order.
sectors = [row[0] for row in rows[1:]]

# zip(*rows) transposes the table, so each tuple is one date followed by that
# date's value for every sector. The first tuple is the sector column itself
# and is skipped; next(..., None) keeps an empty file from raising.
columns = zip(*rows)
next(columns, None)
for key, *values in columns:
    for sector, value in zip(sectors, values):
        # Cells are text, so "zero" means the string '0'.
        if value != '0':
            print(key, sector)
7/2/2007 AA
7/2/2007 AAL
7/2/2007 AGCG
7/3/2007 A
7/3/2007 AA
7/3/2007 AAL
7/3/2007 XL
7/5/2007 A
7/5/2007 AA
7/5/2007 AAL
7/5/2007 AGCG
7/5/2007 XL
7/6/2007 A
7/6/2007 AGCG
7/6/2007 XL
7/9/2007 AAL
7/9/2007 AGCG
7/9/2007 XL
使用 numpy,您可以将 csv 数据转换为数组(矩阵)。
然后你可以迭代转置矩阵。
import numpy as np

# dtype=str makes explicit what NumPy does with a mixed str/int table anyway:
# every cell, including the counts, is stored as a string.
data = np.array([['Date', '7/2/2007', '7/3/2007', '7/5/2007', '7/6/2007', '7/9/2007'],
                 ['A', 0, 1, 3, 2, 0],
                 ['AA', 23, 423, 2, 0, 0],
                 ['AAL', 34, 23, 5, 0, 234],
                 ['AGCG', 234, 0, 9, 234, 23],
                 ['XL', 0, 65, 34, 34, 34]], dtype=str)

# First column without its header cell: the sector names, in row order.
sectors = data[1:, 0]

# data.T is the transposed table, so each row of data.T[1:] is one date
# followed by that date's value for every sector.
pairs = []
for column in data.T[1:]:
    day, values = column[0], column[1:]
    # The cells are strings, so the test must be against '0'; comparing with
    # the integer 0 never matches and would report every sector for every date.
    pairs.extend((str(day), str(sector)) for sector in sectors[values != '0'])

for day, sector in pairs:
    print(day, sector)
我找到了问题所在,这里自己回答一下,并在代码中用注释标出了改动之处。
import csv
import sys
from collections import defaultdict


def createFile(di):
    """Write every (sector, date) pair in *di* to ``cum_file.csv``.

    *di* maps a date string to the list of sectors that are non-zero on that
    date. One ``sector,date`` row is written per pair, and each pair is also
    echoed to stdout.
    """
    # CHANGE: the with-block closes the file, so all rows are flushed to disk;
    # newline="" lets the csv module control the line endings itself.
    with open("cum_file.csv", "w", newline="") as ff:
        csvwriter = csv.writer(ff)
        for d, t in di.items():
            for tt in t:
                print(tt, d)
                csvwriter.writerow([tt, d])


def main():
    """Read ``./try/test.csv`` and write its non-zero (sector, date) pairs."""
    dd = defaultdict(list)
    with open("./try/test.csv", newline="") as f:
        reader = csv.reader(f, delimiter=",")
        # Header row: skip its first cell, the remaining cells are the dates.
        dateList = next(reader, [])[1:]
        for tickerDate in dateList:
            dd[tickerDate] = []
        print(dateList)
        print(dd)
        for line in reader:
            if not line:
                continue
            tkr = line[0]
            # CHANGE: line.index(eachCol) returns the position of the FIRST
            # matching value, so duplicate non-zero values in one row were
            # assigned to the wrong date. Pairing each cell with its date by
            # position removes the lookup (and the need to blank out cells).
            for tickerDate, eachCol in zip(dateList, line[1:]):
                if eachCol != '0':
                    dd[tickerDate].append(tkr)
    createFile(dd)


if __name__ == "__main__":
    main()
感谢社区抽出宝贵时间。
import pandas as pd

# In input.csv the column headed "Date" holds the sector names; every other
# column is one date.
df = pd.read_csv("input.csv")
date_columns = [column for column in df.columns if column != "Date"]

# Collect the (date, sector) pairs first and build the frame once: appending
# with .loc one row at a time gets slower as the frame grows.
records = [
    (day, row["Date"])
    for _, row in df.iterrows()
    for day in date_columns
    if row[day] != 0
]
df1 = pd.DataFrame(records, columns=["Date", "Sector"])
# df1 = df1.sort_values(by="Date")  # optional; note this sorts the dates as text
df1.to_csv("output.csv", index=False)
Output.csv
Date,Sector
7/6/2007,A
7/3/2007,A
7/5/2007,A
7/3/2007,AA
7/2/2007,AA
7/5/2007,AA
7/9/2007,AAL
7/3/2007,AAL
7/2/2007,AAL
7/5/2007,AAL
7/9/2007,AGCG
7/6/2007,AGCG
7/2/2007,AGCG
7/5/2007,AGCG
7/9/2007,XL
7/6/2007,XL
7/3/2007,XL
7/5/2007,XL
我有一个 CSV 文件,其中第一行是日期,第一列是板块(Sector)。如下所示:
Date,7/2/2007,7/3/2007,7/5/2007,7/6/2007,7/9/2007
A,0,1,3,2,0
AA,23,423,2,0,0
AAL,34,23,5,0,234
AGCG,234,0,9,234,23
XL,0,65,34,34,34
所以现在我想准备另一个文件,就像
Date,Sector
7/2/2007,AA
7/2/2007,AAL
7/2/2007,AGCG
7/3/2007,A
7/3/2007,AA
7/3/2007,AAL
7/3/2007,XL
...
背后的逻辑是:对于每个日期,我只想列出数值不为 0 的板块(Sector)。
到目前为止我尝试过的代码是:
import csv,sys
from collections import defaultdict
# NOTE(review): the indentation of this snippet was lost when it was pasted, so
# the nesting implied by the comments below is inferred -- confirm against the
# original post. The snippet is Python 2 (print statement, dict.iteritems).

# Maps a date string to the list of sectors collected for that date.
dd = defaultdict(list)
# Dates taken from the header row, in column order (the first header cell is skipped).
dateList = []
# False until the header row has been consumed.
header = False
def createFile(di):
# Write a [sector, date] CSV row to cum_file.csv for the sectors stored under
# each date in di, and echo the same pairs to stdout.
# The file object is never closed explicitly in this snippet.
ff = open("cum_file.csv","w")
csvwriter = csv.writer(ff)
row = []
for d,t in di.iteritems():
for tt in t:
print tt,d
row = [tt,d]
csvwriter.writerow(row)
# Empty the row list in place.
del row[:]
#with open("./data/StrategyAcctValue-Daily.csv") as f:
with open("./try/test.csv") as f:
reader = csv.reader(f,delimiter=",")
for line in reader:
# col1 marks that the next cell read is the first cell of the row.
col1 = True
if header:
#sys.exit()
# Data row: the first cell is the sector name, the others are per-date values.
for eachCol in line:
if col1:
col1 = False
tkr = eachCol
elif eachCol != '0':
# list.index gives the position of the FIRST cell in this row whose text
# equals eachCol.
tkrIndex = line.index(eachCol)
# dateList has no entry for the first column, hence the -1 offset.
tickerDate = dateList[tkrIndex - 1]
dd[tickerDate].append(tkr)
else:
continue
#print dd
#createFile(dd)
#sys.exit()
else:
# Header row: remember every date and give it an empty sector list.
header = True
for eachCol in line:
# print line.index(eachCol)
# continue
if col1:
col1 = False
tkr = eachCol
else:
dd[eachCol] = []
dateList.append(eachCol)
print dateList
print dd
createFile(dd)
这给出了如下输出:
A 7/3/2007
AA 7/3/2007
AAL 7/3/2007
XL 7/3/2007
A 7/6/2007
AAL 7/9/2007
AGCG 7/9/2007
AA 7/2/2007
AAL 7/2/2007
AGCG 7/2/2007
AGCG 7/2/2007
A 7/5/2007
AA 7/5/2007
AAL 7/5/2007
AGCG 7/5/2007
XL 7/5/2007
XL 7/5/2007
XL 7/5/2007
我找不到哪里出错了。
import csv

# csv.reader copes with quoted fields and any line ending. Blank lines are
# dropped so that they cannot truncate the transpose below to a single column.
with open("test.csv", newline="") as f:
    rows = [row for row in csv.reader(f) if row]

# First column without its header cell: the sector names, in row order.
sectors = [row[0] for row in rows[1:]]

# zip(*rows) transposes the table, so each tuple is one date followed by that
# date's value for every sector. The first tuple is the sector column itself
# and is skipped; next(..., None) keeps an empty file from raising.
columns = zip(*rows)
next(columns, None)
for key, *values in columns:
    for sector, value in zip(sectors, values):
        # Cells are text, so "zero" means the string '0'.
        if value != '0':
            print(key, sector)
7/2/2007 AA
7/2/2007 AAL
7/2/2007 AGCG
7/3/2007 A
7/3/2007 AA
7/3/2007 AAL
7/3/2007 XL
7/5/2007 A
7/5/2007 AA
7/5/2007 AAL
7/5/2007 AGCG
7/5/2007 XL
7/6/2007 A
7/6/2007 AGCG
7/6/2007 XL
7/9/2007 AAL
7/9/2007 AGCG
7/9/2007 XL
使用 numpy,您可以将 csv 数据转换为数组(矩阵)。 然后你可以迭代转置矩阵。
import numpy as np

# dtype=str makes explicit what NumPy does with a mixed str/int table anyway:
# every cell, including the counts, is stored as a string.
data = np.array([['Date', '7/2/2007', '7/3/2007', '7/5/2007', '7/6/2007', '7/9/2007'],
                 ['A', 0, 1, 3, 2, 0],
                 ['AA', 23, 423, 2, 0, 0],
                 ['AAL', 34, 23, 5, 0, 234],
                 ['AGCG', 234, 0, 9, 234, 23],
                 ['XL', 0, 65, 34, 34, 34]], dtype=str)

# First column without its header cell: the sector names, in row order.
sectors = data[1:, 0]

# data.T is the transposed table, so each row of data.T[1:] is one date
# followed by that date's value for every sector.
pairs = []
for column in data.T[1:]:
    day, values = column[0], column[1:]
    # The cells are strings, so the test must be against '0'; comparing with
    # the integer 0 never matches and would report every sector for every date.
    pairs.extend((str(day), str(sector)) for sector in sectors[values != '0'])

for day, sector in pairs:
    print(day, sector)
我找到了问题所在,这里自己回答一下,并在代码中用注释标出了改动之处。
import csv
import sys
from collections import defaultdict


def createFile(di):
    """Write every (sector, date) pair in *di* to ``cum_file.csv``.

    *di* maps a date string to the list of sectors that are non-zero on that
    date. One ``sector,date`` row is written per pair, and each pair is also
    echoed to stdout.
    """
    # CHANGE: the with-block closes the file, so all rows are flushed to disk;
    # newline="" lets the csv module control the line endings itself.
    with open("cum_file.csv", "w", newline="") as ff:
        csvwriter = csv.writer(ff)
        for d, t in di.items():
            for tt in t:
                print(tt, d)
                csvwriter.writerow([tt, d])


def main():
    """Read ``./try/test.csv`` and write its non-zero (sector, date) pairs."""
    dd = defaultdict(list)
    with open("./try/test.csv", newline="") as f:
        reader = csv.reader(f, delimiter=",")
        # Header row: skip its first cell, the remaining cells are the dates.
        dateList = next(reader, [])[1:]
        for tickerDate in dateList:
            dd[tickerDate] = []
        print(dateList)
        print(dd)
        for line in reader:
            if not line:
                continue
            tkr = line[0]
            # CHANGE: line.index(eachCol) returns the position of the FIRST
            # matching value, so duplicate non-zero values in one row were
            # assigned to the wrong date. Pairing each cell with its date by
            # position removes the lookup (and the need to blank out cells).
            for tickerDate, eachCol in zip(dateList, line[1:]):
                if eachCol != '0':
                    dd[tickerDate].append(tkr)
    createFile(dd)


if __name__ == "__main__":
    main()
感谢社区抽出宝贵时间。
import pandas as pd

# In input.csv the column headed "Date" holds the sector names; every other
# column is one date.
df = pd.read_csv("input.csv")
date_columns = [column for column in df.columns if column != "Date"]

# Collect the (date, sector) pairs first and build the frame once: appending
# with .loc one row at a time gets slower as the frame grows.
records = [
    (day, row["Date"])
    for _, row in df.iterrows()
    for day in date_columns
    if row[day] != 0
]
df1 = pd.DataFrame(records, columns=["Date", "Sector"])
# df1 = df1.sort_values(by="Date")  # optional; note this sorts the dates as text
df1.to_csv("output.csv", index=False)
Output.csv
Date,Sector
7/6/2007,A
7/3/2007,A
7/5/2007,A
7/3/2007,AA
7/2/2007,AA
7/5/2007,AA
7/9/2007,AAL
7/3/2007,AAL
7/2/2007,AAL
7/5/2007,AAL
7/9/2007,AGCG
7/6/2007,AGCG
7/2/2007,AGCG
7/5/2007,AGCG
7/9/2007,XL
7/6/2007,XL
7/3/2007,XL
7/5/2007,XL