
I am trying to convert a very large csv file to parquet.

I have tried the following method:

df1 = pd.read_csv('/kaggle/input/amex-default-prediction/train_data.csv')
df1.to_parquet('/kaggle/input/amex-default-prediction/train.parquet')

but pd.read_csv throws an out-of-memory error.

Is there any way to convert the file without loading it entirely?

  • I think you need to read the CSV using pyarrow (maybe with the open_csv() method, due to memory constraints) and then write it as a Parquet "partitioned dataset" using the pyarrow.parquet.write_to_dataset() method. You can get some info here: arrow.apache.org/docs/python/… Commented Jun 29, 2022 at 3:58
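
Following the pyarrow suggestion in the comment above, here is a minimal sketch of a streaming conversion, assuming a reasonably recent pyarrow in which pyarrow.csv.open_csv returns a record-batch reader that can be iterated; the output path under /kaggle/working is an assumption (the /kaggle/input folders are read-only, so the original code could not write there anyway):

import pyarrow as pa
import pyarrow.csv as pa_csv
import pyarrow.parquet as pq

src = '/kaggle/input/amex-default-prediction/train_data.csv'
dst = '/kaggle/working/train.parquet'   # assumed output location; input dirs are read-only

reader = pa_csv.open_csv(src)           # streaming reader, yields one record batch at a time
with pq.ParquetWriter(dst, reader.schema) as writer:
    for batch in reader:
        writer.write_table(pa.Table.from_batches([batch]))

This keeps only one batch in memory at a time and produces a single Parquet file rather than a folder of chunks.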

1 Answer

To solve the memory problem, you can read the data with pandas' chunk method (the chunksize argument of read_csv) and save each chunk as a separate Parquet file. For your case, create a folder "train_data", and in this folder save the different Parquet files that correspond to the chunks.

import os
import sys
import pandas as pd
import pyarrow.parquet as pq

path = "C:/.../amex-default-prediction/"   # folder containing the source csv files
parquet = "parquet/"                       # root folder for the parquet output
# create one sub-folder per csv file, e.g. a new folder "train_data"
path_sample_submission = "sample_submission/"
path_test_data = "test_data/"
path_train_data = "train_data/"
path_train_label = "train_labels/"


def get_path_parquet(file):
    """Return the output sub-folder that matches the csv file name."""
    name = file.split('.')[0]
    if name == "sample_submission":
        return path_sample_submission
    elif name == "test_data":
        return path_test_data
    elif name == "train_data":
        return path_train_data
    elif name == "train_labels":
        return path_train_label
def csv_to_parquet(df, title, path, i):
    """
    Convert one csv chunk to parquet
    df    : chunk data (DataFrame)
    title : name of the source csv file
    path  : folder into which the parquet data is saved
    i     : chunk index, appended to the file name
    """
    try:
        title_prefix = title.split(".")[0] + str(i)
        out_title = os.path.join(path, f'{title_prefix}.parquet')
        df.to_parquet(out_title, engine='fastparquet')
    except Exception:
        sys.exit(-1)
def loading_csv_with_chunk(path, file):
    try:
        # chunksize makes read_csv return an iterator of DataFrames
        # instead of loading the whole file into memory
        chunk_csv = pd.read_csv(os.path.join(path, file), low_memory=False, chunksize=5000)
        return chunk_csv
    except Exception:
        sys.exit(-1)
        

def read_partition_parquet():
    # read every chunk file of the train data back as a single DataFrame
    dataset = pq.ParquetDataset(path + parquet + path_train_data, use_legacy_dataset=False)
    data = dataset.read().to_pandas()
    return data


for file in os.listdir(path):
    if file[-4:] == ".csv":

        print("begin process for : " + str(file) + "....")
        ## load data with the chunk method
        chunk_csv = loading_csv_with_chunk(path, file)
        ## make sure the output sub-folder exists
        os.makedirs(path + parquet + get_path_parquet(file), exist_ok=True)
        ## save each chunk in parquet format
        for i, df_chunk in enumerate(chunk_csv):
            print(df_chunk.shape)
            title_prefix = file.split(".")[0] + str(i)
            out_title = path + parquet + get_path_parquet(file) + f'{title_prefix}.parquet'
            df_chunk.to_parquet(out_title, engine='fastparquet')
        print("end process for : " + str(file) + "....")
    else:
        continue
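
As a rough usage sketch (folder names as defined above), the chunk files written into the train_data sub-folder can then be read back into a single DataFrame, either with the read_partition_parquet helper or directly with pandas:

# read every chunk file in the folder back as one DataFrame
train_df = pd.read_parquet(path + parquet + path_train_data, engine='pyarrow')
# or, equivalently, through the helper defined above
# train_df = read_partition_parquet()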
