The usual approach to reading a large file

import pandas as pd

f = open('./data/ows-raw.txt', encoding='utf-8')
reader = pd.read_table(f, sep=',', iterator=True, error_bad_lines=False)  # skip malformed lines
loop = True
chunkSize = 100000
chunks = []

while loop:
    try:
        chunk = reader.get_chunk(chunkSize)
        chunks.append(chunk)
    except StopIteration:
        loop = False
        print("Iteration is stopped.")

df = pd.concat(chunks, ignore_index=True)
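For reference, the same chunked read can be written more compactly by passing chunksize= directly, which makes read_table return an iterator of DataFrames. A minimal sketch, assuming pandas >= 1.3 (where on_bad_lines='skip' replaced the deprecated error_bad_lines=False):

import pandas as pd

# chunksize= turns read_table into an iterator of DataFrames,
# so the manual get_chunk / StopIteration loop above is unnecessary.
chunks = pd.read_table('./data/ows-raw.txt', sep=',', encoding='utf-8',
                       chunksize=100000, on_bad_lines='skip')
df = pd.concat(chunks, ignore_index=True)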

STORY

A few days ago I needed to read a .dta file and convert it to .csv. A quick Google search showed that pandas supports the dta format as well.

So I started writing right away and had it done in 20 lines.

But things weren't that simple…

The read_stata method immediately threw a ValueError.


Another round of Googling: the related GitHub issues were all unresolved, and Stack Overflow had a few suggestions, but they didn't seem to be about this particular error.

Solution

With no better option, I went and read the source myself, and found that StataReader's get_chunk method apparently cannot default to reading everything when no chunksize is given. So I fell back on the approach below: keep halving the chunksize until the whole file has been read:

import pandas as pd
import os
import re


target_path = './data/excel/{}.csv'


def dta_to_excel(origin_path):
    CHUNKSIZE = 2000
    reader = pd.read_stata(origin_path, iterator=True)
    file_name = re.sub(r'\.dta', '', origin_path.split('/')[-1])
    print('{} translate start'.format(file_name))

    chunks = []
    while CHUNKSIZE > 0:
        try:
            print('Will get {} lines'.format(CHUNKSIZE))
            chunk = reader.get_chunk(CHUNKSIZE)
            chunks.append(chunk)
        except ValueError:
            # get_chunk failed near the end of the file; halve the chunk
            # size and retry. Integer division eventually drives CHUNKSIZE
            # to 0, which ends the loop.
            print('CHUNKSIZE too large')
            CHUNKSIZE //= 2

    df = pd.concat(chunks, ignore_index=True)
    df.to_csv(target_path.format(file_name))
    print('{} translated done'.format(file_name))


if __name__ == '__main__':
    origin_dir = './data/origin'
    # os.listdir: list every file (and directory) under the given path
    for path in os.listdir(origin_dir):
        dta_to_excel(os.path.join(origin_dir, path))

It finally produces correct output…
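As a postscript: newer pandas releases make this workaround unnecessary, since passing chunksize= to read_stata returns an iterable StataReader that can also be used as a context manager. A minimal sketch under that assumption, with hypothetical input/output paths:

import pandas as pd

# Hypothetical paths for illustration. The reader yields DataFrames of up to
# 2000 rows each; concat stitches them back together.
with pd.read_stata('./data/origin/example.dta', chunksize=2000) as reader:
    df = pd.concat(reader, ignore_index=True)
df.to_csv('./data/excel/example.csv')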