I'm new to Python and to Pandas, and I am desperately trying to understand how or why this is happening.
I have a CSV file in which some rows contain extra commas that are not properly escaped. There are 4 column headers, but some rows end up with 5 fields because of these unescaped commas.
data.csv:
Index,First Name,Middle Name,Last Name
1,Mr. Al\, B.,grüBen,Johnson
2,"Mr. Al\, B.",grüBen,Johnson
3,\"Mr. Al\, B.\",grüBen,Johnson
4,Mr. Al\, B.,grüBen,Johnson
I want to read this CSV directly into a Pandas dataframe. My expectation is that Pandas should throw or warn me about the data being inconsistent with the header columns, but instead it does a very strange thing where it seems to drop the first value in each row, which would have been the index. The code and output will illustrate this better than I can with words.
main.py:
import csv
import pandas as pd
def main():
    """Run each CSV-reading experiment against data.csv, one banner per step."""
    path = "data.csv"
    steps = [
        ("Read STD/CSV", read_csv_std),
        ("Validate STD/CSV", validate_csv_std),
        ("Read PANDAS default", read_csv_pandas),
        ("Read PANDAS with provided headers", read_csv_pandas_with_provided_headers),
        ("Read PANDAS with pyarrow engine", read_csv_pandas_with_pyarrow_engine),
        ("Validate PANDAS by type casting", validate_csv_pandas_by_casting),
    ]
    for title, step in steps:
        print_block(title)
        step(path)
def print_block(text):
    """Print a section banner around *text*."""
    fence = "=" * 6
    print(f"{fence} {text} {fence}")
def read_csv_std(path):
    """Dump every row of the CSV at *path* with its index and field count."""
    with open(path, newline="") as handle:
        for i, row in enumerate(csv.reader(handle)):
            print(f"i={i}, len={len(row)} -> {row}")
def validate_csv_std(path):
    """Check every data row of the CSV at *path* against the header's field count."""
    with open(path, newline="") as handle:
        rows = csv.reader(handle)
        expected = len(next(rows))  # header defines the expected width
        for i, row in enumerate(rows, start=1):
            actual = len(row)
            if actual == expected:
                print(f"i={i} - ✅ - expected {expected} fields, saw {actual}")
            else:
                print(
                    f"i={i} - ❌ - expected {expected} fields, saw {actual} -> {row}"
                )
def read_csv_pandas(path):
    """Read the CSV at *path* with pandas defaults and print header plus rows."""
    # on_bad_lines does nothing here whether 'error', 'warn' or 'skip' -
    # pandas silently moves the columns around instead - see logs
    df = pd.read_csv(path, on_bad_lines="error")
    header = df.columns.to_list()
    print(f"i=0, len={len(header)} -> {header}")
    for i, row in df.iterrows():
        row_values = row.tolist()
        print(f"i={i}, len={len(row_values)} -> {row_values}")
def read_csv_pandas_with_provided_headers(path):
    """Read the header row manually, then hand it to pandas via names=.

    Works, but we end up having to read the CSV up front, and the header
    line then shows up again as a data row in the frame.
    """
    with open(path, newline="") as handle:
        header = next(csv.reader(handle))
    print(f"i=0, len={len(header)} -> {header}")
    df = pd.read_csv(path, names=header, on_bad_lines="skip")
    for i, row in df.iterrows():
        row_values = row.tolist()
        print(f"i={i}, len={len(row_values)} -> {row_values}")
def read_csv_pandas_with_pyarrow_engine(path):
    """Read the CSV via pandas' pyarrow engine, skipping bad lines.

    This gives the desired result, but the implications of switching
    engines are not fully clear.
    """
    df = pd.read_csv(path, engine="pyarrow", on_bad_lines="skip")
    header = df.columns.to_list()
    print(f"i=0, len={len(header)} -> {header}")
    for i, row in df.iterrows():
        row_values = row.tolist()
        print(f"i={i}, len={len(row_values)} -> {row_values}")
def validate_csv_pandas_by_casting(path):
    """Force pandas to type-check the Index column via a converter.

    The converter raises as soon as a non-integer value lands in the
    Index column; the resulting frame itself is discarded.
    """
    converters = {"Index": validated_int}
    pd.read_csv(path, converters=converters)
def validated_int(x: str) -> int:
    """Cast *x* to int; pandas will surface the ValueError if it isn't one."""
    value = int(x)
    return value
# Guard the entry point so importing this module does not trigger the run.
if __name__ == "__main__":
    main()
Here is the output from the program:
====== Read STD/CSV ======
i=0, len=4 -> ['Index', 'First Name', 'Middle Name', 'Last Name']
i=1, len=5 -> ['1', 'Mr. Al\\', ' B.', 'grüBen', 'Johnson']
i=2, len=4 -> ['2', 'Mr. Al\\, B.', 'grüBen', 'Johnson']
i=3, len=5 -> ['3', '\\"Mr. Al\\', ' B.\\"', 'grüBen', 'Johnson']
i=4, len=5 -> ['4', 'Mr. Al\\', ' B.', 'grüBen', 'Johnson']
====== Validate STD/CSV ======
i=1 - ❌ - expected 4 fields, saw 5 -> ['1', 'Mr. Al\\', ' B.', 'grüBen', 'Johnson']
i=2 - ✅ - expected 4 fields, saw 4
i=3 - ❌ - expected 4 fields, saw 5 -> ['3', '\\"Mr. Al\\', ' B.\\"', 'grüBen', 'Johnson']
i=4 - ❌ - expected 4 fields, saw 5 -> ['4', 'Mr. Al\\', ' B.', 'grüBen', 'Johnson']
====== Read PANDAS default ======
i=0, len=4 -> ['Index', 'First Name', 'Middle Name', 'Last Name']
i=1, len=4 -> ['Mr. Al\\', ' B.', 'grüBen', 'Johnson']
i=2, len=4 -> ['Mr. Al\\, B.', 'grüBen', 'Johnson', nan]
i=3, len=4 -> ['\\"Mr. Al\\', ' B.\\"', 'grüBen', 'Johnson']
i=4, len=4 -> ['Mr. Al\\', ' B.', 'grüBen', 'Johnson']
====== Read PANDAS with provided headers ======
i=0, len=4 -> ['Index', 'First Name', 'Middle Name', 'Last Name']
i=0, len=4 -> ['Index', 'First Name', 'Middle Name', 'Last Name']
i=1, len=4 -> ['2', 'Mr. Al\\, B.', 'grüBen', 'Johnson']
====== Read PANDAS with pyarrow engine ======
i=0, len=4 -> ['Index', 'First Name', 'Middle Name', 'Last Name']
i=0, len=4 -> [2, 'Mr. Al\\, B.', 'grüBen', 'Johnson']
====== Validate PANDAS by type casting ======
Traceback (most recent call last):
File "/Users/cillian/git/python/personal/python-playground/01_read_csv/main.py", line 97, in <module>
main()
File "/Users/cillian/git/python/personal/python-playground/01_read_csv/main.py", line 18, in main
validate_csv_pandas_by_casting(file)
File "/Users/cillian/git/python/personal/python-playground/01_read_csv/main.py", line 87, in validate_csv_pandas_by_casting
pd.read_csv(
File "/Users/cillian/git/python/personal/python-playground/.venv/lib/python3.12/site-packages/pandas/io/parsers/readers.py", line 1026, in read_csv
return _read(filepath_or_buffer, kwds)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/cillian/git/python/personal/python-playground/.venv/lib/python3.12/site-packages/pandas/io/parsers/readers.py", line 626, in _read
return parser.read(nrows)
^^^^^^^^^^^^^^^^^^
File "/Users/cillian/git/python/personal/python-playground/.venv/lib/python3.12/site-packages/pandas/io/parsers/readers.py", line 1923, in read
) = self._engine.read( # type: ignore[attr-defined]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/cillian/git/python/personal/python-playground/.venv/lib/python3.12/site-packages/pandas/io/parsers/c_parser_wrapper.py", line 234, in read
chunks = self._reader.read_low_memory(nrows)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "pandas/_libs/parsers.pyx", line 838, in pandas._libs.parsers.TextReader.read_low_memory
File "pandas/_libs/parsers.pyx", line 921, in pandas._libs.parsers.TextReader._read_rows
File "pandas/_libs/parsers.pyx", line 1045, in pandas._libs.parsers.TextReader._convert_column_data
File "pandas/_libs/parsers.pyx", line 2116, in pandas._libs.parsers._apply_converter
File "/Users/cillian/git/python/personal/python-playground/01_read_csv/main.py", line 94, in validated_int
return int(x) # pandas will raise a ValueError if this isn't an int
^^^^^^
ValueError: invalid literal for int() with base 10: 'Mr. Al\\'
Can anyone explain why this is happening? Am I holding it wrong? Should Pandas throw/warn, or just silently massage the data like it does?
Edit:
I should have added what I want/expect to happen. My expectation is that I should be able to get Pandas to error/warn/skip without an extra read of the csv.
So the output of my PyArrow approach is exactly what I want, but I don't think I can move to PyArrow as the engine for Pandas since I also need to make use of chunking in Pandas.
I guess I could move over to PyArrow and convert to Pandas:
def read_csv_pyarrow(path):
    """Read the CSV with PyArrow, skipping malformed rows, then print as pandas."""
    parse_options = pacsv.ParseOptions(invalid_row_handler=skip_handler)
    read_options = pacsv.ReadOptions(block_size=50)
    table = pacsv.read_csv(
        path,
        parse_options=parse_options,
        read_options=read_options,
    )
    print(table.to_pandas())
def read_csv_pyarrow_incremental(path):
    """Stream the CSV with PyArrow's incremental reader, skipping malformed rows."""
    parse_options = pacsv.ParseOptions(invalid_row_handler=skip_handler)
    read_options = pacsv.ReadOptions(block_size=50)
    stream = pacsv.open_csv(
        path,
        parse_options=parse_options,
        read_options=read_options,
    )
    print(stream.read_pandas())
def skip_handler(invalid_row):
    """Log the invalid row, then tell PyArrow to drop it."""
    print(invalid_row)
    return "skip"
Which produces:
====== Read PYARROW default ======
InvalidRow(expected_columns=4, actual_columns=5, number=None, text='1,Mr. Al\\, B.,grüBen,Johnson')
InvalidRow(expected_columns=4, actual_columns=5, number=None, text='3,\\"Mr. Al\\, B.\\",grüBen,Johnson')
InvalidRow(expected_columns=4, actual_columns=5, number=None, text='4,Mr. Al\\, B.,grüBen,Johnson')
Index First Name Middle Name Last Name
0 2 Mr. Al\, B. grüBen Johnson
====== Read PYARROW incremental ======
InvalidRow(expected_columns=4, actual_columns=5, number=None, text='1,Mr. Al\\, B.,grüBen,Johnson')
InvalidRow(expected_columns=4, actual_columns=5, number=None, text='3,\\"Mr. Al\\, B.\\",grüBen,Johnson')
InvalidRow(expected_columns=4, actual_columns=5, number=None, text='4,Mr. Al\\, B.,grüBen,Johnson')
Index First Name Middle Name Last Name
0 2 Mr. Al\, B. grüBen Johnson
Or maybe I should jump straight to Polars? I cannot seem to get it to skip a row with the wrong number of fields like PyArrow does:
def read_csv_polars(path):
    """Attempt to read the CSV with Polars (via PyArrow), ignoring errors."""
    wanted_columns = ["Index", "First Name", "Middle Name", "Last Name"]
    frame = pl.read_csv(
        path,
        columns=wanted_columns,
        use_pyarrow=True,
        infer_schema=False,
        ignore_errors=True,
    )
    print(frame)
Which throws:
====== Read POLARS default ======
Traceback (most recent call last):
File "/Users/cillian.myles/git/github.com/CillianMyles/python-playground/01_read_csv/main.py", line 151, in <module>
main()
File "/Users/cillian.myles/git/github.com/CillianMyles/python-playground/01_read_csv/main.py", line 33, in main
read_csv_polars(file)
File "/Users/cillian.myles/git/github.com/CillianMyles/python-playground/01_read_csv/main.py", line 126, in read_csv_polars
df = pl.read_csv(
^^^^^^^^^^^^
File "/Users/cillian.myles/git/github.com/CillianMyles/python-playground/.venv/lib/python3.12/site-packages/polars/_utils/deprecation.py", line 128, in wrapper
return function(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/cillian.myles/git/github.com/CillianMyles/python-playground/.venv/lib/python3.12/site-packages/polars/_utils/deprecation.py", line 128, in wrapper
return function(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/cillian.myles/git/github.com/CillianMyles/python-playground/.venv/lib/python3.12/site-packages/polars/_utils/deprecation.py", line 128, in wrapper
return function(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/cillian.myles/git/github.com/CillianMyles/python-playground/.venv/lib/python3.12/site-packages/polars/io/csv/functions.py", line 334, in read_csv
tbl = pa.csv.read_csv(
^^^^^^^^^^^^^^^^
File "pyarrow/_csv.pyx", line 1260, in pyarrow._csv.read_csv
File "pyarrow/_csv.pyx", line 1269, in pyarrow._csv.read_csv
File "pyarrow/error.pxi", line 155, in pyarrow.lib.pyarrow_internal_check_status
File "pyarrow/error.pxi", line 92, in pyarrow.lib.check_status
pyarrow.lib.ArrowInvalid: CSV parse error: Expected 4 columns, got 5: 1,Mr. Al\, B.,grüBen,Johnson
Comments on the question (reconstructed): Pandas assigns `df.index` to the first-column values (it does not drop them completely). If I do `index_col=False` I get the intact `Index` column, and it prints a warning. — This is caused by `index_col`: pandas treats the first column as row "labels", not as a column with the header `Index`. So it assigns the values in the second column to the header `Index`, and so on. If you use `index_col=False` then it treats the first column as a normal column and assigns it to the header `Index` (and creates labels on its own). — You can also try `duckdb.sql("from read_csv('data.csv', ignore_errors=true)")`. You can use `.df()` or `.pl()` to get a DataFrame, although depending on why Pandas chunking is required you may want to outsource all the work to DuckDB.