follow the python script:
import pandas as pd
from typing import Optional, List, Tuple, Dict, Union
def validate_dataframe(df: pd.DataFrame,
n_cols: Optional[int] = None,
n_rows: Optional[Tuple[int, int]] = None,
columns: Optional[List[str]] = None,
column_types: Optional[Dict[str, type]] = None,
check_duplicates: Optional[bool] = False,
check_null_values: Optional[bool] = False,
unique_columns: Optional[List[str]] = None,
column_ranges: Optional[Dict[str, Tuple[Union[int, float], Union[int, float]]]] = None,
date_columns: Optional[List[str]] = None,
categorical_columns: Optional[Dict[str, List[Union[str, int, float]]]] = None
) -> Tuple[bool, str]:
"""
Validates a Pandas DataFrame based on specified criteria.
Parameters:
- df (pd.DataFrame): The DataFrame to be validated.
- n_cols (int, optional): Number of expected columns in the DataFrame.
- n_rows (tuple, optional): Tuple (min_rows, max_rows) specifying the expected range of rows.
- columns (list, optional): List of column names that should be present in the DataFrame.
- column_types (dict, optional): Dictionary mapping column names to the expected data types.
- check_duplicates (bool, optional): Check for the presence of duplicate rows in the DataFrame.
- check_null_values (bool, optional): Check for the presence of null values in the DataFrame.
- unique_columns (list, optional): List of columns that should have only unique values.
- column_ranges (dict, optional): Dictionary mapping numeric columns to the allowed ranges.
- date_columns (list, optional): List of columns containing date values to validate the format.
- categorical_columns (dict, optional): Dictionary mapping categorical columns to allowed values.
Returns:
- tuple: (bool, str) indicating success or failure, and an optional description of the problem.
"""
# Validate number of columns
if n_cols is not None and len(df.columns) != n_cols:
return (False, f"Error: Expected {n_cols} columns, but found {len(df.columns)} columns.")
# Validate row range
if n_rows is not None:
min_rows, max_rows = n_rows
if not (min_rows <= len(df) <= max_rows):
return (False, f"Error: Number of rows should be between {min_rows} and {max_rows}.")
# Validate columns
if columns is not None:
if not set(columns).issubset(df.columns):
missing_columns = set(columns) - set(df.columns)
return (False, f"Error: Missing columns: {missing_columns}.")
# Validate column types
if column_types is not None:
if set(column_types.keys()).issubset(df.columns):
for col, expected_type in column_types.items():
if not df[col].dtype == expected_type:
return (False, f"Error: Column '{col}' should have type {expected_type}.")
else:
return (False, "Error: Some columns in column_types are not present in the DataFrame.")
# Validate duplicates ££ ONLY SPECIFIC COLUMNS
if check_duplicates and df.duplicated().any():
return (False, "Duplicates found in the DataFrame.")
# Validate null values ££ ONLY SPECIFIC COLUMNS
if check_null_values and df.isnull().any().any():
return (False, "DataFrame contains null values.")
# Validate unique values in specific columns
if unique_columns is not None:
if set(unique_columns).issubset(df.columns):
if df[unique_columns].nunique().ne(df[unique_columns].count()).any():
return (False, "Some columns should have only unique values.")
# Validate values in a specific range
if column_ranges is not None:
for col, value_range in column_ranges.items():
if col in df.columns and not df[col].between(*value_range).all():
return (False, f"Values in '{col}' should be between {value_range[0]} and {value_range[1]}.")
# Validate date format (assuming 'date_columns' are date columns)
if date_columns is not None:
for col in date_columns:
if col in df.columns:
try:
pd.to_datetime(df[col], errors='raise')
except ValueError:
return (False, f"'{col}' should be in a valid date format.")
# Validate categorical values
if categorical_columns is not None:
for col, allowed_values in categorical_columns.items():
if col in df.columns and not df[col].isin(allowed_values).all():
return (False, f"Values in '{col}' should be {allowed_values}.")
# If all validations pass, return True
return (True, "DataFrame has passed all validations.")