# Source code for ssapy_toolkit.IO.csv_utils

from typing import List, Dict, Optional, Union
from pandas import read_csv as pd_read_csv, DataFrame, errors, concat
from .guess_delimiter import guess_csv_delimiter
import numpy as np
import csv
import os
from .get_memory import get_memory_usage
from .io_utils import exists


def read_csv(file_name: str, sep: Optional[str] = None, dtypes: Optional[Dict[str, Union[str, np.dtype]]] = None, col: Union[bool, str, List[str], None] = False, to_np: bool = False, drop_nan: bool = False, skiprows: Optional[List[int]] = None) -> Union[DataFrame, np.ndarray]:
    """
    Read a CSV file with options.

    Args:
        file_name (str): The path to the CSV file.
        sep (Optional[str]): The delimiter used in the CSV file. When None,
            the delimiter is guessed with ``guess_csv_delimiter``.
        dtypes (Optional[Dict[str, Union[str, np.dtype]]]): Dictionary
            specifying data types for columns.
        col (Union[bool, str, List[str], None]): Column(s) to read. ``False``
            or ``None`` reads every column; a single value is wrapped into a
            one-element list; a list or tuple is used as given.
        to_np (bool): Convert the loaded data to a NumPy array (squeezed).
        drop_nan (bool): Drop rows with missing values.
        skiprows (Optional[List[int]]): Rows to skip while reading the CSV
            file. ``None`` (the default) skips nothing.

    Returns:
        Union[pd.DataFrame, np.ndarray]: A DataFrame or NumPy array with the
        loaded data.

    Author:
        Travis Yeager (yeager7@llnl.gov)
    """
    if sep is None:
        sep = guess_csv_delimiter(file_name)  # Guess the delimiter

    # Options shared by the primary read and the fallback read.
    read_kwargs = {"sep": sep, "skiprows": skiprows}

    # ``None`` and ``False`` both mean "all columns"; anything else is a
    # column selection that pandas expects as a list.
    if col is not None and col is not False:
        read_kwargs["usecols"] = list(col) if isinstance(col, (list, tuple)) else [col]

    try:
        df = pd_read_csv(file_name, on_bad_lines='skip', dtype=dtypes, **read_kwargs)
    except TypeError:
        # Fallback kept from the original implementation: retry without
        # ``on_bad_lines`` and load every column as ``object``.
        df = pd_read_csv(file_name, dtype=object, **read_kwargs)

    if drop_nan:
        df = df.dropna()

    if to_np:
        return np.squeeze(df.to_numpy())
    return df
def makedf(df: Union[DataFrame, List, Dict]) -> DataFrame:
    """
    Return the input as a Pandas DataFrame.

    Lists and dictionaries are passed through ``DataFrame.from_dict``; any
    other object is handed back unchanged.

    Args:
        df (Union[pd.DataFrame, List, Dict]): A DataFrame, list, or dictionary.

    Returns:
        pd.DataFrame: The converted (or original) object.

    Author:
        Travis Yeager (yeager7@llnl.gov)
    """
    needs_conversion = isinstance(df, (list, dict))
    if not needs_conversion:
        return df
    return DataFrame.from_dict(df)
def save_csv(file_name: str, df: DataFrame, sep: str = ',', dtypes: Optional[Dict[str, Union[str, np.dtype]]] = None) -> None:
    """
    Write tabular data to a CSV file (without the index column).

    Args:
        file_name (str): Destination path of the CSV file.
        df (pd.DataFrame): Data to write; it is first passed through
            ``makedf``.
        sep (str): Field delimiter for the output file.
        dtypes (Optional[Dict[str, Union[str, np.dtype]]]): Optional column
            dtype mapping applied with ``astype`` before writing.

    Returns:
        None

    Author:
        Travis Yeager (yeager7@llnl.gov)
    """
    frame = makedf(df)
    # Cast only when a non-empty dtype mapping was supplied.
    frame = frame.astype(dtypes) if dtypes else frame
    frame.to_csv(file_name, index=False, sep=sep)
    print(f'Saved {file_name} successfully.')
def read_csv_header(file_name: str, sep: Optional[str] = None) -> List[str]:
    """
    Get the header (first row) of a CSV file.

    Args:
        file_name (str): The filename of the CSV file.
        sep (Optional[str]): The delimiter used in the CSV file. When None,
            the delimiter is guessed with ``guess_csv_delimiter``.

    Returns:
        List[str]: A list of the header fields. An empty list is returned
        when the file contains no rows.

    Author:
        Travis Yeager (yeager7@llnl.gov)
    """
    if sep is None:
        sep = guess_csv_delimiter(file_name)  # Guess the delimiter

    # newline='' lets the csv module do its own newline handling, so line
    # breaks embedded in quoted header fields are preserved verbatim.
    with open(file_name, 'r', newline='') as infile:
        reader = csv.reader(infile, delimiter=sep)
        # Default of [] avoids returning None / raising on an empty file.
        return list(next(reader, []))
def save_csv_header(filename: str, header: List[str], delimiter: str = ',') -> None:
    """
    Create (or overwrite) a CSV file containing only a header row.

    Args:
        filename (str): Path of the file to write.
        header (List[str]): Column names making up the header row.
        delimiter (str, optional): Field delimiter. Default is comma (',').

    Returns:
        None

    Author:
        Travis Yeager (yeager7@llnl.gov)
    """
    # Mode 'w' truncates any existing file before the header is written.
    with open(filename, 'w', newline='') as handle:
        csv.writer(handle, delimiter=delimiter).writerow(header)
def append_csv(file_names: List[str], save_path: str = 'combined_data.csv', sep: str = ',', dtypes: Optional[Dict[str, Union[str, np.dtype]]] = None, progress: Optional[callable] = None) -> None:
    """
    Appends multiple CSV files into a single CSV file.

    Each input file is read with a delimiter guessed by
    ``guess_csv_delimiter``; files that are missing, empty, or unparsable are
    reported and skipped. If no file could be read, nothing is written.

    Args:
        file_names (List[str]): A list of CSV file names.
        save_path (str): The path to the output CSV file. A falsy value falls
            back to 'combined_data.csv' in the current working directory.
        sep (str): The delimiter used when writing the combined file.
        dtypes (Optional[Dict[str, Union[str, np.dtype]]]): A dictionary
            specifying data types for columns of the combined data.
        progress (Optional[callable]): When not None, progress reporting is
            enabled: ``get_memory_usage()`` is called and a progress line is
            printed after each file. The object itself is not invoked.

    Returns:
        None

    Author:
        Travis Yeager (yeager7@llnl.gov)
    """
    error_files = []
    dataframes = []
    total = len(file_names)

    for position, file in enumerate(file_names, start=1):
        try:
            df = pd_read_csv(file, sep=guess_csv_delimiter(file))
        except (FileNotFoundError, errors.EmptyDataError, errors.ParserError) as e:
            error_files.append(file)
            print(f"Error processing file {file}: {e}")
            continue

        dataframes.append(df)
        if progress is not None:
            get_memory_usage()
            print(f"Appended {position} of {total}. File: {file}")

    if not dataframes:
        # pandas.concat raises ValueError on an empty sequence; report the
        # situation instead of crashing and leave the disk untouched.
        print('No data could be read; nothing was written.')
        if error_files:
            print(f'The following files ERRORED and were not included: {error_files}')
        return

    combined_df = concat(dataframes, ignore_index=True)
    if dtypes:
        combined_df = combined_df.astype(dtypes)

    combined_df.to_csv(save_path or 'combined_data.csv', sep=sep, index=False)

    print(f'The final dataframe has {combined_df.shape[0]} rows and {combined_df.shape[1]} columns.')
    if error_files:
        print(f'The following files ERRORED and were not included: {error_files}')
    return
def append_csv_on_disk(csv_files: List[str], output_file: str) -> None:
    """
    Appends multiple CSV files directly to a single output file on disk.

    The delimiter is guessed from the first input file and used for every
    input and for the output. The header row is written once (taken from the
    first non-empty input); the first row of every other input is treated as
    its header and skipped. Empty input files are skipped.

    Args:
        csv_files (List[str]): A list of CSV files to append. Must contain at
            least one entry.
        output_file (str): The path to the output CSV file (overwritten).

    Returns:
        None

    Raises:
        IndexError: If ``csv_files`` is empty.

    Author:
        Travis Yeager (yeager7@llnl.gov)
    """
    delimiter = guess_csv_delimiter(csv_files[0])
    header_written = False

    with open(output_file, 'w', newline='') as outfile:
        writer = csv.writer(outfile, delimiter=delimiter)

        for path in csv_files:
            with open(path, 'r', newline='') as infile:
                reader = csv.reader(infile, delimiter=delimiter)

                # next(..., None) avoids StopIteration on an empty file,
                # which would otherwise abort with a half-written output.
                header = next(reader, None)
                if header is None:
                    continue

                if not header_written:
                    writer.writerow(header)
                    header_written = True

                # Stream the remaining data rows straight to the output.
                writer.writerows(reader)

    print(f'Completed appending of: {output_file}.')
def append_dict_to_csv(file_name: str, data_dict: Dict[str, List[Union[str, float, int]]], delimiter: str = ',') -> None:
    """
    Appends data from a dictionary to a CSV file.

    A header row (the dictionary keys) is written when the file does not
    exist yet or exists but is empty. The number of rows written equals the
    length of the first column.

    Args:
        file_name (str): The path to the CSV file.
        data_dict (Dict[str, List[Union[str, float, int]]]): A dictionary
            where keys are column names and values are lists of column data.
            A 2-D NumPy array (columns named ``col0``, ``col1``, ...) or a
            DataFrame is also accepted and converted first.
        delimiter (str): The delimiter used in the CSV file.

    Returns:
        None

    Raises:
        IndexError: If any column is shorter than the first column. The check
            happens before the file is touched, so no partial rows are
            written.

    Author:
        Travis Yeager (yeager7@llnl.gov)
    """
    # Normalise ndarray / DataFrame input to a {column name: list} mapping.
    if isinstance(data_dict, np.ndarray):
        data_dict = {f'col{i}': data_dict[:, i].tolist() for i in range(data_dict.shape[1])}
    elif isinstance(data_dict, DataFrame):
        data_dict = data_dict.to_dict(orient='list')

    keys = list(data_dict.keys())
    if not keys:
        # Nothing to write; avoid indexing into an empty value list.
        return
    columns = list(data_dict.values())

    # Row count is defined by the first column.
    n_rows = len(columns[0])
    if any(len(column) < n_rows for column in columns):
        raise IndexError('every column must have at least as many entries as the first column')

    # An existing but empty file still needs its header.
    needs_header = not os.path.exists(file_name) or os.path.getsize(file_name) == 0

    with open(file_name, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=delimiter)
        if needs_header:
            writer.writerow(keys)
        # zip stops after n_rows rows because the first column is the shortest.
        writer.writerows(zip(*columns))
def save_csv_array_to_line(filename: str, array: List[Union[str, float, int]], delimiter: str = ',') -> None:
    """
    Append one row of values to a CSV file.

    Args:
        filename (str): Path of the file to append to; it is opened in
            append mode, so a missing file is created.
        array (List[Union[str, float, int]]): Values forming the single row.
        delimiter (str, optional): Field delimiter. Default is comma (',').

    Returns:
        None

    Author:
        Travis Yeager (yeager7@llnl.gov)
    """
    with open(filename, 'a', newline='') as handle:
        row_writer = csv.writer(handle, delimiter=delimiter)
        row_writer.writerow(array)
def save_csv_line(file_name: str, df: DataFrame, sep: str = ',', dtypes: Optional[Dict[str, Union[str, np.dtype]]] = None) -> None:
    """
    Write a DataFrame to a CSV file, appending when the file already exists.

    When ``exists(file_name)`` is true the rows are appended without a
    header; otherwise the file is created through ``save_csv``.

    Args:
        file_name (str): The path to the CSV file.
        df (pd.DataFrame): Data to write; it is first passed through
            ``makedf``.
        sep (str): The delimiter used in the CSV file.
        dtypes (Optional[Dict[str, Union[str, np.dtype]]]): Optional column
            dtype mapping applied with ``astype`` before writing.

    Returns:
        None

    Author:
        Travis Yeager (yeager7@llnl.gov)
    """
    frame = makedf(df)
    if dtypes:
        frame = frame.astype(dtypes)

    # New file: delegate so the header row is written.
    if not exists(file_name):
        save_csv(file_name, frame, sep=sep)
        return

    # Existing file: append data rows only.
    frame.to_csv(file_name, mode='a', index=False, header=False, sep=sep)
# Lazily initialised cache for exists_in_csv. Once populated it maps
# (csv_file, column, sep) -> (file signature, column values). Resetting it to
# None clears the cache.
_column_data = None


def exists_in_csv(csv_file: str, column: str, number: Union[int, float, str], sep: str = ',') -> bool:
    """
    Checks if a number exists in a specific column of a CSV file.

    The column is loaded through ``read_csv`` and cached per
    (file, column, delimiter). The cached values are reloaded whenever the
    file's modification time or size changes, so rows appended after an
    earlier call are seen.

    Args:
        csv_file (str): The path to the CSV file.
        column (str): The column name to search.
        number (Union[int, float, str]): The value to search for.
        sep (str, optional): The delimiter used in the CSV file. Default is
            comma (',').

    Returns:
        bool: True if the number exists in the column, False otherwise
        (including when the file cannot be read).

    Author:
        Travis Yeager (yeager7@llnl.gov)
    """
    global _column_data
    if _column_data is None:
        _column_data = {}

    # Fingerprint the file so a stale cache entry is detected. If the path
    # cannot be stat'ed the signature stays None and read_csv decides whether
    # the source is readable.
    try:
        file_stat = os.stat(csv_file)
        signature = (file_stat.st_mtime_ns, file_stat.st_size)
    except (OSError, TypeError, ValueError):
        signature = None

    cache_key = (str(csv_file), str(column), sep)
    cached = _column_data.get(cache_key)

    if cached is not None and cached[0] == signature:
        values = cached[1]
    else:
        try:
            values = read_csv(csv_file, sep=sep, col=column, to_np=True)
        except IOError:
            return False
        _column_data[cache_key] = (signature, values)

    found = np.isin(number, values)
    # Scalar query -> plain bool; array-like queries keep their element-wise mask.
    return bool(found) if np.ndim(found) == 0 else found