import pandas as pd

# Read a large CSV in chunks of one million rows; `filename` and `process`
# are placeholders for your own file and per-chunk handler.
chunksize = 10 ** 6
with pd.read_csv(filename, chunksize=chunksize) as reader:
    for chunk in reader:
        process(chunk)

This helps avoid a MemoryError on files that do not fit into memory at once.
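As a hedged illustration of what process(chunk) might do, here is a minimal sketch that aggregates value counts across chunks instead of loading the whole file; the file name big.csv and the column name status are assumptions made only for the example.

import pandas as pd
from collections import Counter

# Hypothetical example: count values of one column in a CSV that is too large
# to load at once. "big.csv" and the "status" column are assumed names.
counts = Counter()

with pd.read_csv("big.csv", chunksize=10 ** 6) as reader:
    for chunk in reader:
        # Each chunk is an ordinary DataFrame, so normal pandas operations apply.
        counts.update(chunk["status"].value_counts().to_dict())

print(counts.most_common(10))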
# -*- coding: utf-8 -*-
'''
Input: one or more directories or files given as command-line arguments.
Output: result.csv with one row per file:
file name;md5 sum;size;created;modified;extension;full directory;hostname
'''
import os
import hashlib
import platform
import time
import sys


def md5(fname):
    # Hash the file in 4 KB blocks so large files do not have to fit in memory.
    # https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file
    hash_md5 = hashlib.md5()
    with open(fname, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()


class MyItem:
    '''Collects the metadata of a single file for one CSV row.'''
    def __init__(self, f):
        self.hostname = hostname
        self.md5sum = md5(f)
        self.size = os.path.getsize(f)
        self.created = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(os.path.getctime(f)))
        self.modified = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(os.path.getmtime(f)))
        self.ext = os.path.splitext(f)[1]
        self.full_dir, self.file_name = os.path.split(os.path.abspath(f))


hostname = platform.node()
result_header = "file_name;md5sum;size;created;modified;ext;full_path;hostname\n"
result_file = open('./result.csv', 'w')
result_file.write(result_header)

# Targets come from the command line; with no arguments, scan the current directory.
targets = []
if len(sys.argv) > 1:
    targets = sys.argv[1:]
else:
    targets.append(".")

for t in targets:
    if os.path.exists(t):
        if os.path.isdir(t):
            for current_dir, dirs, files in os.walk(t):
                for f in files:
                    try:
                        i = os.path.join(current_dir, f)
                        print(i)
                        item = MyItem(i)
                        result_file.write('%s;%s;%s;%s;%s;%s;%s;%s\n' % (item.file_name, item.md5sum, item.size, item.created, item.modified, item.ext, item.full_dir, item.hostname))
                    except OSError as e:
                        # A file may be unreadable or vanish between listing and hashing.
                        print('Error: %s' % e)
        if os.path.isfile(t):
            try:
                i = t
                print(i)
                item = MyItem(i)
                result_file.write('%s;%s;%s;%s;%s;%s;%s;%s\n' % (item.file_name, item.md5sum, item.size, item.created, item.modified, item.ext, item.full_dir, item.hostname))
            except OSError as e:
                print('Error: %s' % e)

result_file.close()
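The script is run as, for example, python md5_script.py /path/to/dir (the file name md5_script.py is just a placeholder). As a hedged follow-up, the sketch below reads the result.csv produced above (semicolon-separated, with the header written by the script) in chunks, reusing the chunksize pattern from the beginning, and reports md5 sums that occur more than once. The duplicate-search use case and the chunk size of 100 000 rows are assumptions made for illustration only.

import pandas as pd
from collections import defaultdict

# Sketch: find files sharing an md5sum in result.csv without loading it all at once.
seen = defaultdict(list)

with pd.read_csv('./result.csv', sep=';', chunksize=100_000) as reader:
    for chunk in reader:
        for md5sum, file_name in zip(chunk['md5sum'], chunk['file_name']):
            seen[md5sum].append(file_name)

for md5sum, names in seen.items():
    if len(names) > 1:
        # More than one file with the same checksum: likely duplicates.
        print(md5sum, names)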