Source code for gaiaxpy.file_parser.parse_internal_continuous

"""
parse_internal_continuous.py
====================================
Module to parse input files containing internally calibrated continuous spectra.
"""

import numpy as np
import pandas as pd
import re
from fastavro import reader
from astropy.io.votable import parse_single_table
from .parse_generic import GenericParser
from .utils import _csv_to_avro_map, _get_from_dict
from gaiaxpy.core import array_to_symmetric_matrix
from gaiaxpy.file_parser import DataMismatchError
from .cast import _cast

# Avoid warning, false positive
pd.options.mode.chained_assignment = None

# Columns that contain arrays (as strings)
array_columns = [
    'bp_coefficients',
    'bp_coefficient_errors',
    'rp_coefficients',
    'rp_coefficient_errors']
# Pairs of the form (matrix_size (N), values_to_put_in_matrix) for columns
# that contain matrices as strings
matrix_columns = [('bp_n_parameters', 'bp_coefficient_correlations'),
                  ('rp_n_parameters', 'rp_coefficient_correlations')]


[docs]class InternalContinuousParser(GenericParser): """ Parser for internally calibrated continuous spectra. """ def _parse_csv(self, csv_file): """ Parse the input CSV file and store the result in a pandas DataFrame if it contains internally calibrated continuous spectra. Args: csv_file (str): Path to a CSV file. Returns: DataFrame: Pandas DataFrame representing the CSV file. """ return super()._parse_csv( csv_file, array_columns=array_columns, matrix_columns=matrix_columns) def _parse_fits(self, fits_file): """ Parse the input FITS file and store the result in a pandas DataFrame if it contains internally calibrated continuous spectra. Args: csv_file (str): Path to a FITS file. Returns: DataFrame: Pandas DataFrame representing the FITS file. """ return super()._parse_fits( fits_file, array_columns=array_columns, matrix_columns=matrix_columns) def _parse_xml(self, xml_file): """ Parse the input XML file and store the result in a pandas DataFrame. Args: xml_file (str): Path to an XML file. array_columns (list): List of columns in the file that contain arrays as strings. Returns: DataFrame: A pandas DataFrame representing the XML file. """ try: votable = parse_single_table(xml_file) except ValueError: raise DataMismatchError() columns = [re.search('<FIELD ID="(.+?)"', str(field)).group(1) for field in votable.fields] values_to_df = [] for index, entry in enumerate(votable.array): row = [] for column, value in zip(columns, entry): row.append(value) values_to_df.append(row) df = pd.DataFrame(values_to_df, columns=columns) if matrix_columns is not None: for index, row in df.iterrows(): for size_column, values_column in matrix_columns: try: df[values_column][index] = array_to_symmetric_matrix( df[size_column][index].astype(int), row[values_column]) # Value can be NaN when a band is not present except IndexError: continue return _cast(df) def _parse_avro(self, avro_file): """ Parse the input AVRO file and return the result as a Pandas DataFrame. Args: avro_file (str): Path to an AVRO file. Returns: DataFrame: Pandas DataFrame representing the AVRO file. """ def _avro_file_to_df(avro_file): records_list = [] f = open(avro_file, 'rb') avro_reader = reader(f) record = avro_reader.next() while record: try: current_record = {} for key in _csv_to_avro_map.keys(): # Access the record and get the value corresponding to the key # If the record is a list, it must be converted to numpy array value = _get_from_dict(record, _csv_to_avro_map[key]) if isinstance(value, list): value = np.array(value) # Add this value to a dictionary which represents the current # record current_record[key] = value # Append this record to the global list records_list.append(current_record) except KeyError: raise KeyError("Keys in the input file don't match the expected ones.") try: # Move onto the next record record = avro_reader.next() except StopIteration: # Reached end on file, close it and break f.close() break # Records to DataFrame return pd.DataFrame(records_list) # Pairs of the form (matrix_size (N), values_to_put_in_matrix) for columns # that contain matrices as strings to_matrix_columns = [('bp_n_parameters', 'bp_coefficient_covariances'), ('rp_n_parameters', 'rp_coefficient_covariances')] df = _avro_file_to_df(avro_file) for index, row in df.iterrows(): for size_column, values_column in to_matrix_columns: try: df[values_column][index] = array_to_symmetric_matrix( df[size_column][index].astype(int), row[values_column]) # Value can be NaN when a band is not present except TypeError: continue return _cast(df)