创建用于测试的大文本文件

Create big text file for testing

我正在对文件执行 python ETL。

但是我只有一个小模板。真实文件会有20多GB。

如何把这个小文件扩充成一个大文件?只要重复其中的行即可。


0|18033552000161|032021|PAR_200|21659151780|0|0|C-200|07032021|7252048000136|2||AE370020085702004652088|0|6|11:04:33|1096.14|1||SP|||1048485455|1048485455
1|18033552000161|032022|PAR_200|21650311633|0|0|C-200|07032022|7252048000136|5||CZ1900136063002100728667|0|4|11:04:33|3835.44|1||MG|||56047633650|56047633650
2|18033552000161|032023|PAR_200|21653803883|0|0|C-200|07032023|7252048000136|7||NO2010954004040|0|2|11:04:33|2207.90|1||RJ|||13680102704|13680102704
3|18033552000161|032024|PAR_200|21651787399|0|0|C-200|07032024|7252048000136|11||BE23698700576689|0|2|11:04:33|2752.31|1||SP|||13444926731|13444926731
4|18033552000161|032025|PAR_200|21655452859|0|0|C-200|07032025|7252048000136|10||MU52STRM9362758860940050637YTD|0|7|11:04:33|389.10|1||MG|||40145813657|40145813657
5|18033552000161|032026|PAR_200|21654162541|0|0|C-200|07032026|7252048000136|0||AL154065638669490R4EO0ATK790|0|8|11:04:33|1295.54|1||MG|||3833577800|3833577800
6|18033552000161|032027|PAR_200|21650154230|0|0|C-200|07032027|7252048000136|9||CY063005789014W8962038L23033|0|99|11:04:33|682.84|1||MG|||56171794112|56171794112
7|18033552000161|032028|PAR_200|21653758532|0|0|C-200|07032028|7252048000136|12||TN1140020006380769002385|0|7|11:04:33|4859.10|1||SP|||44457199605|44457199605
8|18033552000161|032029|PAR_200|21650944307|0|0|C-200|07032029|7252048000136|13||MK11604E9J413L98997|0|8|11:04:33|4764.07|1||MG|||34072364096|34072364096
9|18033552000161|032030|PAR_200|21655982923|0|0|C-200|07032030|7252048000136|14||MR4814700801110012154008003|0|99|11:04:33|4180.82|1||MG|||70262241749|70262241749
10|18033552000161|032031|PAR_200|21653346587|0|0|C-200|07032031|7252048000136|1||LI1600081181694N2K346|0|6|11:04:33|1013.84|1||MG|||17651232321|17651232321
11|18033552000161|032032|PAR_200|21652146638|0|0|C-200|07032032|7252048000136|15||HR1850432373024052004|0|7|11:04:33|3893.96|1||MG|||48517833376|48517833376
12|18033552000161|032033|PAR_200|21657570797|0|0|C-200|07032033|7252048000136|17||HR6560061198829650702|0|5|11:04:33|4639.03|1||RJ|||17073431016|17073431016
13|18033552000161|032034|PAR_200|21650803507|0|0|C-200|07032034|7252048000136|18||PS626486200690492007606208242|0|1|11:04:33|3566.18|1||SP|||31163748480|31163748480
14|18033552000161|032035|PAR_200|21653741119|0|0|C-200|07032035|7252048000136|6||BR7700547510010819283490392W3|0|9|11:04:33|4141.09|1||RJ|||37647368443|37647368443

这段代码会读取您的输入行,并按相同的顺序不断重复,直到临时字符串的大小达到 20 GB。显然,这要求您的机器至少有 20 GB 的内存。您也可以分批写入文件,这样所需的内存会少得多;或者干脆选择一个更小的目标大小。


input = """0|18033552000161|032021|PAR_200|21659151780|0|0|C-200|07032021|7252048000136|2||AE370020085702004652088|0|6|11:04:33|1096.14|1||SP|||1048485455|1048485455
1|18033552000161|032022|PAR_200|21650311633|0|0|C-200|07032022|7252048000136|5||CZ1900136063002100728667|0|4|11:04:33|3835.44|1||MG|||56047633650|56047633650
2|18033552000161|032023|PAR_200|21653803883|0|0|C-200|07032023|7252048000136|7||NO2010954004040|0|2|11:04:33|2207.90|1||RJ|||13680102704|13680102704
3|18033552000161|032024|PAR_200|21651787399|0|0|C-200|07032024|7252048000136|11||BE23698700576689|0|2|11:04:33|2752.31|1||SP|||13444926731|13444926731
4|18033552000161|032025|PAR_200|21655452859|0|0|C-200|07032025|7252048000136|10||MU52STRM9362758860940050637YTD|0|7|11:04:33|389.10|1||MG|||40145813657|40145813657
5|18033552000161|032026|PAR_200|21654162541|0|0|C-200|07032026|7252048000136|0||AL154065638669490R4EO0ATK790|0|8|11:04:33|1295.54|1||MG|||3833577800|3833577800
6|18033552000161|032027|PAR_200|21650154230|0|0|C-200|07032027|7252048000136|9||CY063005789014W8962038L23033|0|99|11:04:33|682.84|1||MG|||56171794112|56171794112
7|18033552000161|032028|PAR_200|21653758532|0|0|C-200|07032028|7252048000136|12||TN1140020006380769002385|0|7|11:04:33|4859.10|1||SP|||44457199605|44457199605
8|18033552000161|032029|PAR_200|21650944307|0|0|C-200|07032029|7252048000136|13||MK11604E9J413L98997|0|8|11:04:33|4764.07|1||MG|||34072364096|34072364096
9|18033552000161|032030|PAR_200|21655982923|0|0|C-200|07032030|7252048000136|14||MR4814700801110012154008003|0|99|11:04:33|4180.82|1||MG|||70262241749|70262241749
10|18033552000161|032031|PAR_200|21653346587|0|0|C-200|07032031|7252048000136|1||LI1600081181694N2K346|0|6|11:04:33|1013.84|1||MG|||17651232321|17651232321
11|18033552000161|032032|PAR_200|21652146638|0|0|C-200|07032032|7252048000136|15||HR1850432373024052004|0|7|11:04:33|3893.96|1||MG|||48517833376|48517833376
12|18033552000161|032033|PAR_200|21657570797|0|0|C-200|07032033|7252048000136|17||HR6560061198829650702|0|5|11:04:33|4639.03|1||RJ|||17073431016|17073431016
13|18033552000161|032034|PAR_200|21650803507|0|0|C-200|07032034|7252048000136|18||PS626486200690492007606208242|0|1|11:04:33|3566.18|1||SP|||31163748480|31163748480
14|18033552000161|032035|PAR_200|21653741119|0|0|C-200|07032035|7252048000136|6||BR7700547510010819283490392W3|0|9|11:04:33|4141.09|1||RJ|||37647368443|37647368443""".split("\n")

parsed_input = [line.split("|") for line in input]

output_content = ""

output_file = "some_big_file.txt"

num_bytes_size = 20 * 1000 * 1000 * 1000  # 20 gigs 

counter = 0
while len(output_content) < num_bytes_size:
    temp = f"{counter}|{'|'.join(parsed_input[counter % len(parsed_input)][1:])}\n"
    print(temp)
    output_content += temp
    counter += 1

with open(output_file, "w") as f:
    f.write(output_content)


示例输出的尾部:

99|18033552000161|032030|PAR_200|21655982923|0|0|C-200|07032030|7252048000136|14||MR4814700801110012154008003|0|99|11:04:33|4180.82|1||MG|||70262241749|70262241749
100|18033552000161|032031|PAR_200|21653346587|0|0|C-200|07032031|7252048000136|1||LI1600081181694N2K346|0|6|11:04:33|1013.84|1||MG|||17651232321|17651232321
101|18033552000161|032032|PAR_200|21652146638|0|0|C-200|07032032|7252048000136|15||HR1850432373024052004|0|7|11:04:33|3893.96|1||MG|||48517833376|48517833376
102|18033552000161|032033|PAR_200|21657570797|0|0|C-200|07032033|7252048000136|17||HR6560061198829650702|0|5|11:04:33|4639.03|1||RJ|||17073431016|17073431016
103|18033552000161|032034|PAR_200|21650803507|0|0|C-200|07032034|7252048000136|18||PS626486200690492007606208242|0|1|11:04:33|3566.18|1||SP|||31163748480|31163748480
104|18033552000161|032035|PAR_200|21653741119|0|0|C-200|07032035|7252048000136|6||BR7700547510010819283490392W3|0|9|11:04:33|4141.09|1||RJ|||37647368443|37647368443
105|18033552000161|032021|PAR_200|21659151780|0|0|C-200|07032021|7252048000136|2||AE370020085702004652088|0|6|11:04:33|1096.14|1||SP|||1048485455|1048485455
106|18033552000161|032022|PAR_200|21650311633|0|0|C-200|07032022|7252048000136|5||CZ1900136063002100728667|0|4|11:04:33|3835.44|1||MG|||56047633650|56047633650
107|18033552000161|032023|PAR_200|21653803883|0|0|C-200|07032023|7252048000136|7||NO2010954004040|0|2|11:04:33|2207.90|1||RJ|||13680102704|13680102704
108|18033552000161|032024|PAR_200|21651787399|0|0|C-200|07032024|7252048000136|11||BE23698700576689|0|2|11:04:33|2752.31|1||SP|||13444926731|13444926731
109|18033552000161|032025|PAR_200|21655452859|0|0|C-200|07032025|7252048000136|10||MU52STRM9362758860940050637YTD|0|7|11:04:33|389.10|1||MG|||40145813657|40145813657
110|18033552000161|032026|PAR_200|21654162541|0|0|C-200|07032026|7252048000136|0||AL154065638669490R4EO0ATK790|0|8|11:04:33|1295.54|1||MG|||3833577800|3833577800
111|18033552000161|032027|PAR_200|21650154230|0|0|C-200|07032027|7252048000136|9||CY063005789014W8962038L23033|0|99|11:04:33|682.84|1||MG|||56171794112|56171794112
112|18033552000161|032028|PAR_200|21653758532|0|0|C-200|07032028|7252048000136|12||TN1140020006380769002385|0|7|11:04:33|4859.10|1||SP|||44457199605|44457199605
113|18033552000161|032029|PAR_200|21650944307|0|0|C-200|07032029|7252048000136|13||MK11604E9J413L98997|0|8|11:04:33|4764.07|1||MG|||34072364096|34072364096
114|18033552000161|032030|PAR_200|21655982923|0|0|C-200|07032030|7252048000136|14||MR4814700801110012154008003|0|99|11:04:33|4180.82|1||MG|||70262241749|70262241749
115|18033552000161|032031|PAR_200|21653346587|0|0|C-200|07032031|7252048000136|1||LI1600081181694N2K346|0|6|11:04:33|1013.84|1||MG|||17651232321|17651232321
116|18033552000161|032032|PAR_200|21652146638|0|0|C-200|07032032|7252048000136|15||HR1850432373024052004|0|7|11:04:33|3893.96|1||MG|||48517833376|48517833376
117|18033552000161|032033|PAR_200|21657570797|0|0|C-200|07032033|7252048000136|17||HR6560061198829650702|0|5|11:04:33|4639.03|1||RJ|||17073431016|17073431016
118|18033552000161|032034|PAR_200|21650803507|0|0|C-200|07032034|7252048000136|18||PS626486200690492007606208242|0|1|11:04:33|3566.18|1||SP|||31163748480|31163748480
119|18033552000161|032035|PAR_200|21653741119|0|0|C-200|07032035|7252048000136|6||BR7700547510010819283490392W3|0|9|11:04:33|4141.09|1||RJ|||37647368443|37647368443
120|18033552000161|032021|PAR_200|21659151780|0|0|C-200|07032021|7252048000136|2||AE370020085702004652088|0|6|11:04:33|1096.14|1||SP|||1048485455|1048485455
121|18033552000161|032022|PAR_200|21650311633|0|0|C-200|07032022|7252048000136|5||CZ1900136063002100728667|0|4|11:04:33|3835.44|1||MG|||56047633650|56047633650
122|18033552000161|032023|PAR_200|21653803883|0|0|C-200|07032023|7252048000136|7||NO2010954004040|0|2|11:04:33|2207.90|1||RJ|||13680102704|13680102704
123|18033552000161|032024|PAR_200|21651787399|0|0|C-200|07032024|7252048000136|11||BE23698700576689|0|2|11:04:33|2752.31|1||SP|||13444926731|13444926731
124|18033552000161|032025|PAR_200|21655452859|0|0|C-200|07032025|7252048000136|10||MU52STRM9362758860940050637YTD|0|7|11:04:33|389.10|1||MG|||40145813657|40145813657
125|18033552000161|032026|PAR_200|21654162541|0|0|C-200|07032026|7252048000136|0||AL154065638669490R4EO0ATK790|0|8|11:04:33|1295.54|1||MG|||3833577800|3833577800