Source code for gaiaxpy.file_parser.parse_generic

"""
parse_generic.py
====================================
Module to parse input files containing spectra.
"""
from os.path import splitext

import pandas as pd
from astropy.table import Table

from gaiaxpy.core.generic_functions import array_to_symmetric_matrix, str_to_array
from .cast import _cast

# Extensions accepted for input files, written without the leading dot.
# InvalidExtensionError lists these values in its message.
valid_extensions = ['avro', 'csv', 'ecsv', 'fits', 'xml']


def _raise_key_error(column):
    raise KeyError(f'The columns in the input data do not match the expected ones. Missing column {column}.')


class InvalidExtensionError(ValueError):
    """
    Error raised when the extension of the input file is not valid. It inherits from ValueError.

    The message lists the entries of the module-level ``valid_extensions``.
    """

    def __init__(self):
        valid = ', '.join(valid_extensions)
        message = f'Valid extensions are: {valid}.'
        # Use super() rather than naming a base class explicitly so that the initialisation follows the MRO.
        super().__init__(message)
class GenericParser:
    """
    Generic spectra parser.

    Selects a parse method from the file extension and returns the file contents as a pandas DataFrame.
    """

    def __init__(self):
        # Message printed by print_info_msg before and after parsing.
        self.info_msg = 'Reading input file...'

    def get_parser(self, extension):
        """
        Choose the parser to use based on the extension.

        Args:
            extension (str): File extension without the leading dot (e.g.: 'csv').

        Returns:
            method: Parse method corresponding to the extension.

        Raises:
            InvalidExtensionError: If the extension is not valid.
        """
        if extension == 'avro':
            return self._parse_avro
        if extension in ['csv', 'ecsv']:
            return self._parse_csv
        if extension == 'fits':
            return self._parse_fits
        if extension == 'xml':
            return self._parse_xml
        raise InvalidExtensionError()

    def parse_file(self, file_path, disable_info=False):
        """
        Parse the input file according to its extension.

        Args:
            file_path (str): Path to a file.
            disable_info (bool): Whether to disable the progress tracker or not.

        Returns:
            DataFrame: Pandas DataFrame representing the file, after being passed through _cast.
            str: File extension without the leading dot (e.g.: 'csv', 'fits' or 'xml').

        Raises:
            InvalidExtensionError: If the extension of the file is not valid.
        """
        if not disable_info:
            self.print_info_msg()
        extension = _get_file_extension(file_path)
        parser = self.get_parser(extension)
        parsed_data = _cast(parser(file_path))
        if not disable_info:
            self.print_info_msg(done=True)
        return parsed_data, extension

    @staticmethod
    def _fill_matrix_columns(df, _matrix_columns, to_array=None):
        """
        Replace, in place, the values of each matrix column by the output of array_to_symmetric_matrix.

        Args:
            df (DataFrame): Data to be modified.
            _matrix_columns (list of tuples): (size_column, values_column) pairs of column names. For every row,
                array_to_symmetric_matrix is called with the value in values_column and the value in size_column.
            to_array (callable): Optional conversion applied to the value in values_column before building the
                matrix (e.g. str_to_array when the values were read as strings).

        Returns:
            DataFrame: The same DataFrame that was received.
        """
        for size_column, values_column in _matrix_columns or []:
            # Bind the loop variables as defaults so the row function does not depend on late binding.
            def _row_to_matrix(row, size_column=size_column, values_column=values_column):
                values = row[values_column]
                if to_array is not None:
                    values = to_array(values)
                return array_to_symmetric_matrix(values, row[size_column])

            df[values_column] = df.apply(_row_to_matrix, axis=1)
        return df

    def _parse_avro(self, avro_file):
        """
        Placeholder for AVRO parsing, which the base class does not provide.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError('Method not implemented for base class.')

    def _parse_csv(self, csv_file, _array_columns=None, _matrix_columns=None, _usecols=None):
        """
        Parse the input CSV file and store the result in a pandas DataFrame.

        Args:
            csv_file (str): Path to a CSV file.
            _array_columns (list): List of columns in the file that contain arrays as strings. Names that are not
                present in the file are ignored.
            _matrix_columns (list of tuples): List of (size_column, values_column) pairs of column names. The first
                column holds the number of rows/columns of a square matrix whose values are those contained in the
                second column.
            _usecols (list): Columns to read.

        Returns:
            DataFrame: A pandas DataFrame representing the CSV file.
        """
        df = pd.read_csv(csv_file, comment='#', float_precision='round_trip', usecols=_usecols)
        if _array_columns:
            # Pandas converters seemed slower
            for column in _array_columns:
                if column in df.columns:
                    df[column] = df[column].apply(str_to_array)
        if _matrix_columns:
            # The values are passed through str_to_array before the matrix is built.
            self._fill_matrix_columns(df, _matrix_columns, to_array=str_to_array)
        return df

    def _parse_fits(self, fits_file, _array_columns=None, _matrix_columns=None, _usecols=None):
        """
        Parse the input FITS file and store the result in a pandas DataFrame.

        Args:
            fits_file (str): Path to a FITS file.
            _array_columns (list): Accepted for signature parity with the CSV parser; not used by this method.
            _matrix_columns (list of tuples): List of (size_column, values_column) pairs of column names. The first
                column holds the number of rows/columns of a square matrix whose values are those contained in the
                second column.
            _usecols (list): Columns to keep.

        Returns:
            DataFrame: A pandas DataFrame representing the FITS file.
        """
        table = Table.read(fits_file, format='fits')
        df = table.to_pandas()[_usecols] if _usecols else table.to_pandas()
        if _matrix_columns:
            self._fill_matrix_columns(df, _matrix_columns)
        return df

    def _parse_xml(self, xml_file, _array_columns=None, _matrix_columns=None, _usecols=None):
        """
        Parse the input XML file and store the result in a pandas DataFrame.

        Args:
            xml_file (str): Path to an XML file.
            _array_columns (list): Accepted for signature parity with the CSV parser; not used by this method.
            _matrix_columns (list of tuples): List of (size_column, values_column) pairs of column names. The first
                column holds the number of rows/columns of a square matrix whose values are those contained in the
                second column.
            _usecols (list): Columns to keep.

        Returns:
            DataFrame: A pandas DataFrame representing the XML file.
        """
        # Astropy won't automatically remove the columns that are not in _usecols, but it speeds up the process a bit
        table = Table.read(xml_file, columns=_usecols)
        # The table read by Astropy will still contain all the columns
        df = table.to_pandas()[_usecols] if _usecols else table.to_pandas()
        if _matrix_columns:
            self._fill_matrix_columns(df, _matrix_columns)
        return df

    def print_info_msg(self, done=False):
        """
        Print the progress message, terminated with a carriage return instead of a newline.

        Args:
            done (bool): If True, ' Done!' is appended to the message.
        """
        msg = self.info_msg
        if done:
            msg = msg + ' Done!'
        print(msg, end='\r')
def _get_file_extension(file_path): """ Get the extension of a file. Args: file_path (str): Path to a file. Returns: str: File extension (e.g.: '.csv') """ _, file_extension = splitext(file_path) return file_extension[1:]