Importing a large csv into DB using pandas
I have a csv file with roughly 3 million records that I want to migrate to a SQL Server database from my laptop (4GB of RAM). pandas reads the file into a DataFrame just fine (pd.read_csv()), but when I attempt the migration (.to_sql()) I get a MemoryError.
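For reference, this is roughly what I'm running (the connection string is a simplified placeholder; my real server details and file name differ):

import pandas as pd
from sqlalchemy import create_engine

# Placeholder connection string -- my actual DSN/credentials differ.
engine = create_engine('mssql+pyodbc://user:password@my_dsn')

# This step completes without problems, even with ~3 million rows.
csv = pd.read_csv('stats.csv')  # 'stats.csv' stands in for my real file

# This is the call that raises MemoryError.
csv.to_sql(name='stats', con=engine, if_exists='append')

The full traceback: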
---------------------------------------------------------------------------
MemoryError Traceback (most recent call last)
<ipython-input-12-94c146c2b7b7> in <module>()
----> 1 csv.to_sql(name='stats', con=engine, if_exists='append')
C:\Python27\lib\site-packages\pandas\core\generic.pyc in to_sql(self, name, con, flavor, schema, if_exists, index, index_label, chunksize, dtype)
964 self, name, con, flavor=flavor, schema=schema, if_exists=if_exists,
965 index=index, index_label=index_label, chunksize=chunksize,
--> 966 dtype=dtype)
967
968 def to_pickle(self, path):
C:\Python27\lib\site-packages\pandas\io\sql.pyc in to_sql(frame, name, con, flavor, schema, if_exists, index, index_label, chunksize, dtype)
536 pandas_sql.to_sql(frame, name, if_exists=if_exists, index=index,
537 index_label=index_label, schema=schema,
--> 538 chunksize=chunksize, dtype=dtype)
539
540
C:\Python27\lib\site-packages\pandas\io\sql.pyc in to_sql(self, frame, name, if_exists, index, index_label, schema, chunksize, dtype)
1170 schema=schema, dtype=dtype)
1171 table.create()
-> 1172 table.insert(chunksize)
1173 # check for potentially case sensitivity issues (GH7815)
1174 if name not in self.engine.table_names(schema=schema or self.meta.schema):
C:\Python27\lib\site-packages\pandas\io\sql.pyc in insert(self, chunksize)
715
716 chunk_iter = zip(*[arr[start_i:end_i] for arr in data_list])
--> 717 self._execute_insert(conn, keys, chunk_iter)
718
719 def _query_iterator(self, result, chunksize, columns, coerce_float=True,
C:\Python27\lib\site-packages\pandas\io\sql.pyc in _execute_insert(self, conn, keys, data_iter)
689
690 def _execute_insert(self, conn, keys, data_iter):
--> 691 data = [dict((k, v) for k, v in zip(keys, row)) for row in data_iter]
692 conn.execute(self.insert_statement(), data)
693
MemoryError:
Is there any other way I can make this migration succeed?
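For example, I've been wondering whether processing the file in chunks would keep memory usage down, so that the whole DataFrame never sits in memory at once. A rough sketch of what I have in mind (the chunk size of 50,000 is picked arbitrarily), using the chunksize parameters that read_csv and to_sql both accept:

import pandas as pd
from sqlalchemy import create_engine

engine = create_engine('mssql+pyodbc://user:password@my_dsn')  # placeholder DSN

# Read the csv in pieces; each chunk is a DataFrame of at most
# 50,000 rows, so only one chunk is held in memory at a time.
for chunk in pd.read_csv('stats.csv', chunksize=50000):
    # Append each chunk to the table as it is read.
    chunk.to_sql(name='stats', con=engine, if_exists='append')

But I'm not sure whether that is the right approach, or whether there is a better one.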