Source code for gaiaxpy.file_parser.parse_generic

"""
parse_generic.py
====================================
Module to parse input files containing spectra.
"""

import os
import numpy as np
import pandas as pd
from astropy.table import Table
from astropy.io.votable import parse_single_table
from .cast import _cast
from gaiaxpy.core import array_to_symmetric_matrix

# Avoid warning, false positive: setting this option to None silences pandas'
# chained-assignment (SettingWithCopy) warning. The option is global to pandas,
# so it applies to every module in the process, not only to this one.
pd.options.mode.chained_assignment = None

# File extensions (without the leading dot) that the parser accepts; also
# listed in the message of InvalidExtensionError.
valid_extensions = ['avro', 'csv', 'ecsv', 'fits', 'xml']


def _raise_key_error(column):
    """
    Raise a KeyError reporting a column that is missing from the input data.

    Args:
        column (str): Name of the missing column, included in the message.

    Raises:
        KeyError: Always.
    """
    message = ('The columns in the input data do not match the expected ones. '
               f'Missing column {column}.')
    raise KeyError(message)


class DataMismatchError(RuntimeError):
    """
    Error raised when the data in a file is invalid or the file extension
    does not match the file contents. It inherits from RuntimeError.
    """

    def __init__(self):
        message = 'The file contains invalid data or the data does not match the file extension.'
        # Cooperative initialisation: follow the MRO instead of calling
        # Exception.__init__ directly.
        super().__init__(message)
class InvalidExtensionError(ValueError):
    """
    Error raised when the extension of the input file is not valid.
    It inherits from ValueError.
    """

    def __init__(self):
        # The message lists every entry of the module-level valid_extensions.
        valid = ', '.join(valid_extensions)
        message = f'Valid extensions are: {valid}.'
        # Cooperative initialisation: follow the MRO instead of calling
        # Exception.__init__ directly.
        super().__init__(message)
class GenericParser(object):
    """
    Generic spectra parser.

    Selects a ``_parse_*`` reader from the file extension and returns the
    contents of the file as a pandas DataFrame.
    """

    def get_parser(self, extension):
        """
        Choose the parser to use based on the extension.

        Args:
            extension (str): File extension without the leading dot
                (e.g.: 'csv').

        Returns:
            method: Parse method corresponding to the extension.

        Raises:
            InvalidExtensionError: If the extension is not valid.
        """
        if extension == 'avro':
            return self._parse_avro
        if extension in ('csv', 'ecsv'):
            return self._parse_csv
        if extension == 'fits':
            return self._parse_fits
        if extension == 'xml':
            return self._parse_xml
        raise InvalidExtensionError()

    def parse(self, file_path):
        """
        Parse the input file according to its extension.

        Args:
            file_path (str): Path to a file.

        Returns:
            DataFrame: Pandas DataFrame representing the file, after being
                passed through ``_cast``.
            str: File extension without the leading dot (e.g.: 'csv').

        Raises:
            InvalidExtensionError: If the extension is not valid.
        """
        extension = _get_file_extension(file_path)
        parser = self.get_parser(extension)
        parsed_data = _cast(parser(file_path))
        return parsed_data, extension

    def _parse_avro(self, avro_file):
        """
        Placeholder for the Avro reader.

        This class does not implement an Avro reader itself; the method exists
        so that requesting the 'avro' parser fails with an explicit message
        instead of an AttributeError. Subclasses can override it.

        Args:
            avro_file (str): Path to an Avro file.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError(f'{type(self).__name__} does not implement an Avro parser.')

    def _parse_csv(self, csv_file, array_columns=None, matrix_columns=None):
        """
        Parse the input CSV file and store the result in a pandas DataFrame.

        Args:
            csv_file (str): Path to a CSV file.
            array_columns (list): List of columns in the file that contain
                arrays as strings. The first and last characters of each value
                are dropped and the rest is read as comma-separated numbers.
            matrix_columns (list of tuples): List of (size_column,
                values_column) pairs of column names. For every row, the value
                of values_column is replaced by the result of
                array_to_symmetric_matrix applied to the size and the values.

        Returns:
            DataFrame: A pandas DataFrame representing the CSV file.

        Raises:
            DataMismatchError: If the file cannot be decoded as text.
        """
        def _string_to_array(text):
            # Strip the enclosing brackets and read the comma-separated values
            return np.fromstring(text[1:-1], sep=',')

        converters = None
        if array_columns is not None:
            converters = {column: _string_to_array for column in array_columns}
        try:
            df = pd.read_csv(csv_file, comment='#', float_precision='round_trip', converters=converters)
        except UnicodeDecodeError as err:
            raise DataMismatchError() from err
        if matrix_columns is not None:
            for index, row in df.iterrows():
                for size_column, values_column in matrix_columns:
                    try:
                        # Write through .at: chained indexing (df[col][idx] = ...)
                        # is not guaranteed to update the frame and does nothing
                        # under pandas Copy-on-Write.
                        df.at[index, values_column] = array_to_symmetric_matrix(
                            df[size_column][index].astype(int),
                            np.fromstring(row[values_column][1:-1], sep=','))
                    # Value can be NaN when a band is not present
                    except TypeError:
                        continue
        return df

    def _parse_fits(self, fits_file, array_columns=None, matrix_columns=None):
        """
        Parse the input FITS file and store the result in a pandas DataFrame.

        Args:
            fits_file (str): Path to a FITS file.
            array_columns (list): List of columns in the file that contain
                arrays. Each cell is written back with its own value.
            matrix_columns (list of tuples): List of (size_column,
                values_column) pairs of column names. For every row, the value
                of values_column is replaced by the result of
                array_to_symmetric_matrix applied to the size and the values.

        Returns:
            DataFrame: A pandas DataFrame representing the FITS file.

        Raises:
            DataMismatchError: If the file cannot be read as FITS.
        """
        try:
            data = Table.read(fits_file, format='fits')
        except OSError as err:
            raise DataMismatchError() from err
        columns = list(data.columns.keys())
        # One list of cell values per table row, in column order
        fits_as_list = [[data[column][index] for column in columns] for index in range(len(data))]
        df = pd.DataFrame(fits_as_list, columns=columns)
        if array_columns is not None:
            for column in array_columns:
                for index, row in df.iterrows():
                    try:
                        # .at instead of chained indexing, see _parse_csv
                        df.at[index, column] = row[column]
                    # Value can be NaN when a band is not present
                    except TypeError:
                        continue
        if matrix_columns is not None:
            for index, row in df.iterrows():
                for size_column, values_column in matrix_columns:
                    try:
                        df.at[index, values_column] = array_to_symmetric_matrix(
                            df[size_column][index].astype(int), row[values_column])
                    # Value can be NaN when a band is not present
                    except IndexError:
                        continue
        return df

    def _parse_xml(self, xml_file, array_columns=None):
        """
        Parse the input XML file and store the result in a pandas DataFrame.

        Args:
            xml_file (str): Path to an XML file.
            array_columns (list): List of columns in the file that contain
                arrays. The last len(array_columns) columns of the table are
                treated as the array columns, and the names are expected in
                the same order as in the table.

        Returns:
            DataFrame: A pandas DataFrame representing the XML file.

        Raises:
            DataMismatchError: If the file cannot be parsed as a VOTable.
            KeyError: If one of the array columns is not present in the table.
        """
        try:
            votable = parse_single_table(xml_file).to_table(use_names_over_ids=True)
        except ValueError as err:
            raise DataMismatchError() from err
        if not array_columns:
            return votable.to_pandas()
        columns = list(votable.columns)
        # Every column except the trailing array columns is copied unchanged
        plain_columns = columns[:-len(array_columns)]
        votable_as_list = []
        for index in range(len(votable)):
            values = [votable[column][index] for column in plain_columns]
            # Remove mask
            for column in array_columns:
                try:
                    values.append(votable[column][index].filled())
                except KeyError as err:
                    raise KeyError(
                        f'The columns in the input data do not match the expected ones. Missing column {column}.'
                    ) from err
            votable_as_list.append(values)
        return pd.DataFrame(votable_as_list, columns=columns)
def _get_file_extension(file_path):
    """
    Get the extension of a file.

    Args:
        file_path (str): Path to a file.

    Returns:
        str: File extension without the leading dot (e.g.: 'csv'). Empty
            string if the path has no extension. Letter case is preserved.
    """
    _, file_extension = os.path.splitext(file_path)
    # splitext keeps the leading dot; drop it
    return file_extension[1:]