Module `spatial_inequality.auxiliary.data_handler`

Handler for all direct I/O with raw data files. Serves data parsing, filtering and standardization (for use with other modules).

Expand source code

"""
Handler for all direct I/O with raw data files. Serves data parsing, filtering
and standardization (for use with other modules).
"""
import numpy as np
import pandas as pd

def fix_ncesid(ncesid, mode):
    """
    Standardizes an NCES ID by casting it to a string and left-padding it
    with zeros up to the width associated with the given mode.

    Args:
        ncesid (int): Target NCES ID to fix (e.g. 100005).
        mode (str): Should be either "school" or "district".

    Returns:
        str: Standardized NCES ID (no zero padding is performed if an
            unknown mode is provided).
    """
    widths = {"school": 12, "district": 7}
    # An unknown mode maps to width 0, which makes zfill a no-op
    width = widths[mode] if mode in widths else 0
    return str(ncesid).zfill(width)

def remove_cross_state_neighbors(aug_school_info, school_assignment):
    """
    Removes any neighbouring edge between schools in different states.

    Neighbour lists are read from (and written back to) the "neighbour_ids"
    column as comma-separated strings. Surviving neighbours keep their
    original relative order and duplicates are dropped, so the result is
    deterministic. Only the "neighbour_ids" column is rewritten; all other
    columns are left untouched.

    Args:
        aug_school_info (pandas.DataFrame): Target school information
            (containing neighbouring edges).
        school_assignment (pandas.DataFrame): Target school assignment
            (containing state assignment).

    Returns:
        pandas.DataFrame: Refined school information (with cross-state
            neighbours removed). The input DataFrame is not modified.

    Raises:
        KeyError: If a school with a non-empty neighbour list, or any of
            its neighbours, is missing from `school_assignment`.
    """
    aug_school_info = aug_school_info.copy()

    # Build the school -> state lookup once, instead of performing a
    # DataFrame `.loc` lookup for every single comparison
    state_of = school_assignment["state_name"].to_dict()

    def keep_same_state_neighbors(school_id, neighbor_ids_str):
        # Empty neighbour lists are passed through untouched
        if len(neighbor_ids_str) == 0:
            return neighbor_ids_str
        own_state = state_of[school_id]
        # dict.fromkeys de-duplicates while preserving first-seen order
        unique_neighbor_ids = dict.fromkeys(neighbor_ids_str.split(","))
        return ",".join(
            neighbor_id
            for neighbor_id in unique_neighbor_ids
            if state_of[neighbor_id] == own_state
        )

    aug_school_info["neighbour_ids"] = [
        keep_same_state_neighbors(school_id, neighbor_ids_str)
        for school_id, neighbor_ids_str in zip(
            aug_school_info.index, aug_school_info["neighbour_ids"]
        )
    ]

    return aug_school_info

def remove_invalid_entries(aug_school_info, school_assignment):
    """
    Removes any school entries which do not contain information regarding
    their total funding and/or number of students.

    A school is considered to lack information when either its
    "adjusted_total_revenue_per_student" or its "total_students" value is
    NaN or zero. Neighbour lists ("neighbour_ids", comma-separated strings)
    are then pruned so that they only reference the remaining schools;
    surviving neighbours keep their original relative order and duplicates
    are dropped. Remaining rows keep their original column dtypes.

    Args:
        aug_school_info (pandas.DataFrame): Target school information
            (containing funding and students).
        school_assignment (pandas.DataFrame): Target school assignment
            (containing district and state assignments).

    Returns:
        tuple: Refined school information (`pandas.DataFrame`) and school
            assignment (`pandas.DataFrame`). Neither input is modified.
    """
    required_cols = ["adjusted_total_revenue_per_student", "total_students"]

    # Keep only schools with funding and students. A boolean mask is used
    # rather than an in-place `replace` on a column selection, because the
    # latter is chained assignment and does not write back to the frame
    # when pandas Copy-on-Write is active.
    required = aug_school_info[required_cols]
    has_info = (required.notna() & (required != 0)).all(axis=1)
    aug_school_info = aug_school_info[has_info].copy()

    # Set of all schools w/ available info
    valid_school_ids_set = set(aug_school_info.index)

    # Filter out neighbors wo/ info
    def filter_unavailable_neighbors(neighbor_ids_str):
        # Always return a string, so the column keeps a single value type
        if len(neighbor_ids_str) == 0:
            return ""
        # dict.fromkeys de-duplicates while preserving first-seen order
        unique_neighbor_ids = dict.fromkeys(neighbor_ids_str.split(","))
        return ",".join(
            neighbor_id
            for neighbor_id in unique_neighbor_ids
            if neighbor_id in valid_school_ids_set
        )

    aug_school_info["neighbour_ids"] = aug_school_info["neighbour_ids"].apply(
        filter_unavailable_neighbors
    )

    # Filter out school assignments wo/ info
    is_valid_school = school_assignment.index.isin(aug_school_info.index)
    school_assignment = school_assignment[is_valid_school].copy()

    return aug_school_info, school_assignment

def load_data():
    """
    Builds the school information and school assignment tables used by the
    rest of the package.

    Both tables are read through a default-constructed `DataHandler`, then
    passed through `remove_invalid_entries` and, for the school
    information, through `remove_cross_state_neighbors`.

    Returns:
        tuple: School information (`pandas.DataFrame`) and school assignment
            (`pandas.DataFrame`).
    """
    handler = DataHandler()
    # Drop schools lacking funding/student info before pruning edges
    school_info, assignment = remove_invalid_entries(
        handler.get_augmented_school_info(),
        handler.get_school_assignment()
    )
    school_info = remove_cross_state_neighbors(school_info, assignment)
    return school_info, assignment

class DataHandler:
    """
    Reads the raw CSV data files and turns them into typed, indexed
    DataFrames.

    Attributes:
        __data_path (str): Root directory from which to read data files.
    """
    __data_path = None

    def __init__(self, data_path="../data"):
        self.__data_path = data_path

    def __read_file(self, filename, compression="gzip", encoding="utf-8"):
        """
        Loads a raw CSV data file found under `DataHandler.__data_path`.

        Args:
            filename (str): Name of file to read (w/ extension).
            compression (str): Compression algorithm used in the CSV file.
            encoding (str): Character encoding used in the CSV file.

        Returns:
            pandas.DataFrame: Resulting DataFrame.
        """
        file_path = f"{self.__data_path}/{filename}"
        return pd.read_csv(file_path, compression=compression, encoding=encoding)

    def __format_cols(self, df, type_dict):
        """
        Casts the named columns of a DataFrame to their desired types.

        Args:
            df (pandas.DataFrame): Target DataFrame.
            type_dict (dict): Type mapping for all named columns.

        Returns:
            pandas.DataFrame: Copy of initial DataFrame with typecasted column
                values.
        """
        formatted_df = df.copy()
        for col_name in type_dict:
            target_type = type_dict[col_name]
            formatted_df[col_name] = formatted_df[col_name].astype(target_type)
        return formatted_df

    def get_augmented_school_info(self):
        """
        Builds the school information table extended with the columns of
        each school's district (e.g. 'per-student revenue').

        Returns:
            pandas.DataFrame: Resulting DataFrame.
        """
        school_info = self.get_school_info()
        district_info = self.get_district_info()
        school_assignment = self.get_school_assignment()
        # Attach each school's district, then that district's attributes
        district_of_school = school_assignment[["district_id"]]
        with_district = school_info.merge(
            district_of_school, left_index=True, right_index=True
        )
        augmented = with_district.merge(
            district_info, left_on="district_id", right_index=True
        )
        # The district ID was only needed as a join key
        return augmented.drop(columns=["district_id"])

    def get_school_info(self):
        """
        Loads school information from its CSV file.

        Returns:
            pandas.DataFrame: Resulting DataFrame (indexed on "school_id").
        """
        raw = self.__read_file("school_info.csv")
        # Standardize NCES IDs
        raw["school_id"] = raw["school_id"].apply(fix_ncesid, mode="school")
        # Drop rows with NaN values (per the original note, these are
        # invalid schools that do not show up in
        # 'https://nces.ed.gov/ccd/schoolsearch/').
        complete = raw.dropna()
        # Cast each column to its expected type
        column_types = {
            "school_id": str,
            "neighbour_ids": str,
            "total_students": int
        }
        typed = self.__format_cols(complete, type_dict=column_types)
        return typed.set_index("school_id", drop=True)

    def get_district_info(self):
        """
        Loads district information from its CSV file.

        Returns:
            pandas.DataFrame: Resulting DataFrame (indexed on "district_id").
        """
        raw = self.__read_file("district_info.csv")
        # Standardize NCES IDs
        raw["district_id"] = raw["district_id"].apply(fix_ncesid, mode="district")
        # Cast each column to its expected type
        column_types = {
            "district_id": str,
            "adjusted_local_revenue_per_student": float,
            "adjusted_state_revenue_per_student": float,
            "adjusted_federal_revenue_per_student": float,
            "adjusted_total_revenue_per_student": float,
        }
        typed = self.__format_cols(raw, type_dict=column_types)
        return typed.set_index("district_id", drop=True)

    def get_school_assignment(self):
        """
        Loads the initial school assignment from its CSV file.

        Returns:
            pandas.DataFrame: Resulting DataFrame (indexed on "school_id").
        """
        raw = self.__read_file("school_assignment.csv")
        # Standardize NCES IDs
        raw["school_id"] = raw["school_id"].apply(fix_ncesid, mode="school")
        raw["district_id"] = raw["district_id"].apply(fix_ncesid, mode="district")
        # Cast each column to its expected type
        column_types = {
            "school_id": str,
            "district_id": str,
            "state_name": str
        }
        typed = self.__format_cols(raw, type_dict=column_types)
        return typed.set_index("school_id", drop=True)

Functions

def fix_ncesid(ncesid, mode)

Applies standard formatting (zero padding and typecasting) to both schools' and districts' NCES IDs.

Args

ncesid : int: Target NCES ID to fix (e.g. 100005).
mode : str: Should be either "school" or "district".

Returns

str: Standardized NCES ID (does not perform zero padding if an unknown mode is provided).

Expand source code

def fix_ncesid(ncesid, mode):
    """
    Standardizes an NCES ID by casting it to a string and left-padding it
    with zeros up to the width associated with the given mode.

    Args:
        ncesid (int): Target NCES ID to fix (e.g. 100005).
        mode (str): Should be either "school" or "district".

    Returns:
        str: Standardized NCES ID (no zero padding is performed if an
            unknown mode is provided).
    """
    widths = {"school": 12, "district": 7}
    # An unknown mode maps to width 0, which makes zfill a no-op
    width = widths[mode] if mode in widths else 0
    return str(ncesid).zfill(width)

def load_data()

Loads all necessary data regarding school information and school assignment.

Returns

tuple: School information (pandas.DataFrame) and school assignment (pandas.DataFrame).

Expand source code

def load_data():
    """
    Builds the school information and school assignment tables used by the
    rest of the package.

    Both tables are read through a default-constructed `DataHandler`, then
    passed through `remove_invalid_entries` and, for the school
    information, through `remove_cross_state_neighbors`.

    Returns:
        tuple: School information (`pandas.DataFrame`) and school assignment
            (`pandas.DataFrame`).
    """
    handler = DataHandler()
    # Drop schools lacking funding/student info before pruning edges
    school_info, assignment = remove_invalid_entries(
        handler.get_augmented_school_info(),
        handler.get_school_assignment()
    )
    school_info = remove_cross_state_neighbors(school_info, assignment)
    return school_info, assignment

def remove_cross_state_neighbors(aug_school_info, school_assignment)

Removes any neighbouring edge between schools in different states.

Args

aug_school_info : pandas.DataFrame: Target school information (containing neighbouring edges).
school_assignment : pandas.DataFrame: Target school assignment (containing state assignment).

Returns

pandas.DataFrame: Refined school information (with cross-state neighbours removed).

Expand source code

def remove_cross_state_neighbors(aug_school_info, school_assignment):
    """
    Removes any neighbouring edge between schools in different states.

    Neighbour lists are read from (and written back to) the "neighbour_ids"
    column as comma-separated strings. Surviving neighbours keep their
    original relative order and duplicates are dropped, so the result is
    deterministic. Only the "neighbour_ids" column is rewritten; all other
    columns are left untouched.

    Args:
        aug_school_info (pandas.DataFrame): Target school information
            (containing neighbouring edges).
        school_assignment (pandas.DataFrame): Target school assignment
            (containing state assignment).

    Returns:
        pandas.DataFrame: Refined school information (with cross-state
            neighbours removed). The input DataFrame is not modified.

    Raises:
        KeyError: If a school with a non-empty neighbour list, or any of
            its neighbours, is missing from `school_assignment`.
    """
    aug_school_info = aug_school_info.copy()

    # Build the school -> state lookup once, instead of performing a
    # DataFrame `.loc` lookup for every single comparison
    state_of = school_assignment["state_name"].to_dict()

    def keep_same_state_neighbors(school_id, neighbor_ids_str):
        # Empty neighbour lists are passed through untouched
        if len(neighbor_ids_str) == 0:
            return neighbor_ids_str
        own_state = state_of[school_id]
        # dict.fromkeys de-duplicates while preserving first-seen order
        unique_neighbor_ids = dict.fromkeys(neighbor_ids_str.split(","))
        return ",".join(
            neighbor_id
            for neighbor_id in unique_neighbor_ids
            if state_of[neighbor_id] == own_state
        )

    aug_school_info["neighbour_ids"] = [
        keep_same_state_neighbors(school_id, neighbor_ids_str)
        for school_id, neighbor_ids_str in zip(
            aug_school_info.index, aug_school_info["neighbour_ids"]
        )
    ]

    return aug_school_info

def remove_invalid_entries(aug_school_info, school_assignment)

Removes any school entries which do not contain information regarding their total funding and/or number of students.

Args

aug_school_info : pandas.DataFrame: Target school information (containing funding and students).
school_assignment : pandas.DataFrame: Target school assignment (containing district and state assignments).

Returns

tuple: Refined school information (pandas.DataFrame) and school assignment (pandas.DataFrame).

Expand source code

def remove_invalid_entries(aug_school_info, school_assignment):
    """
    Removes any school entries which do not contain information regarding
    their total funding and/or number of students.

    A school is considered to lack information when either its
    "adjusted_total_revenue_per_student" or its "total_students" value is
    NaN or zero. Neighbour lists ("neighbour_ids", comma-separated strings)
    are then pruned so that they only reference the remaining schools;
    surviving neighbours keep their original relative order and duplicates
    are dropped. Remaining rows keep their original column dtypes.

    Args:
        aug_school_info (pandas.DataFrame): Target school information
            (containing funding and students).
        school_assignment (pandas.DataFrame): Target school assignment
            (containing district and state assignments).

    Returns:
        tuple: Refined school information (`pandas.DataFrame`) and school
            assignment (`pandas.DataFrame`). Neither input is modified.
    """
    required_cols = ["adjusted_total_revenue_per_student", "total_students"]

    # Keep only schools with funding and students. A boolean mask is used
    # rather than an in-place `replace` on a column selection, because the
    # latter is chained assignment and does not write back to the frame
    # when pandas Copy-on-Write is active.
    required = aug_school_info[required_cols]
    has_info = (required.notna() & (required != 0)).all(axis=1)
    aug_school_info = aug_school_info[has_info].copy()

    # Set of all schools w/ available info
    valid_school_ids_set = set(aug_school_info.index)

    # Filter out neighbors wo/ info
    def filter_unavailable_neighbors(neighbor_ids_str):
        # Always return a string, so the column keeps a single value type
        if len(neighbor_ids_str) == 0:
            return ""
        # dict.fromkeys de-duplicates while preserving first-seen order
        unique_neighbor_ids = dict.fromkeys(neighbor_ids_str.split(","))
        return ",".join(
            neighbor_id
            for neighbor_id in unique_neighbor_ids
            if neighbor_id in valid_school_ids_set
        )

    aug_school_info["neighbour_ids"] = aug_school_info["neighbour_ids"].apply(
        filter_unavailable_neighbors
    )

    # Filter out school assignments wo/ info
    is_valid_school = school_assignment.index.isin(aug_school_info.index)
    school_assignment = school_assignment[is_valid_school].copy()

    return aug_school_info, school_assignment

Classes

class DataHandler (data_path='../data')

Class to handle initial data parsing, reading and formatting.

Attributes

__data_path : str: Root directory from which to read data files.

Expand source code

class DataHandler:
    """
    Reads the raw CSV data files and turns them into typed, indexed
    DataFrames.

    Attributes:
        __data_path (str): Root directory from which to read data files.
    """
    __data_path = None

    def __init__(self, data_path="../data"):
        self.__data_path = data_path

    def __read_file(self, filename, compression="gzip", encoding="utf-8"):
        """
        Loads a raw CSV data file found under `DataHandler.__data_path`.

        Args:
            filename (str): Name of file to read (w/ extension).
            compression (str): Compression algorithm used in the CSV file.
            encoding (str): Character encoding used in the CSV file.

        Returns:
            pandas.DataFrame: Resulting DataFrame.
        """
        file_path = f"{self.__data_path}/{filename}"
        return pd.read_csv(file_path, compression=compression, encoding=encoding)

    def __format_cols(self, df, type_dict):
        """
        Casts the named columns of a DataFrame to their desired types.

        Args:
            df (pandas.DataFrame): Target DataFrame.
            type_dict (dict): Type mapping for all named columns.

        Returns:
            pandas.DataFrame: Copy of initial DataFrame with typecasted column
                values.
        """
        formatted_df = df.copy()
        for col_name in type_dict:
            target_type = type_dict[col_name]
            formatted_df[col_name] = formatted_df[col_name].astype(target_type)
        return formatted_df

    def get_augmented_school_info(self):
        """
        Builds the school information table extended with the columns of
        each school's district (e.g. 'per-student revenue').

        Returns:
            pandas.DataFrame: Resulting DataFrame.
        """
        school_info = self.get_school_info()
        district_info = self.get_district_info()
        school_assignment = self.get_school_assignment()
        # Attach each school's district, then that district's attributes
        district_of_school = school_assignment[["district_id"]]
        with_district = school_info.merge(
            district_of_school, left_index=True, right_index=True
        )
        augmented = with_district.merge(
            district_info, left_on="district_id", right_index=True
        )
        # The district ID was only needed as a join key
        return augmented.drop(columns=["district_id"])

    def get_school_info(self):
        """
        Loads school information from its CSV file.

        Returns:
            pandas.DataFrame: Resulting DataFrame (indexed on "school_id").
        """
        raw = self.__read_file("school_info.csv")
        # Standardize NCES IDs
        raw["school_id"] = raw["school_id"].apply(fix_ncesid, mode="school")
        # Drop rows with NaN values (per the original note, these are
        # invalid schools that do not show up in
        # 'https://nces.ed.gov/ccd/schoolsearch/').
        complete = raw.dropna()
        # Cast each column to its expected type
        column_types = {
            "school_id": str,
            "neighbour_ids": str,
            "total_students": int
        }
        typed = self.__format_cols(complete, type_dict=column_types)
        return typed.set_index("school_id", drop=True)

    def get_district_info(self):
        """
        Loads district information from its CSV file.

        Returns:
            pandas.DataFrame: Resulting DataFrame (indexed on "district_id").
        """
        raw = self.__read_file("district_info.csv")
        # Standardize NCES IDs
        raw["district_id"] = raw["district_id"].apply(fix_ncesid, mode="district")
        # Cast each column to its expected type
        column_types = {
            "district_id": str,
            "adjusted_local_revenue_per_student": float,
            "adjusted_state_revenue_per_student": float,
            "adjusted_federal_revenue_per_student": float,
            "adjusted_total_revenue_per_student": float,
        }
        typed = self.__format_cols(raw, type_dict=column_types)
        return typed.set_index("district_id", drop=True)

    def get_school_assignment(self):
        """
        Loads the initial school assignment from its CSV file.

        Returns:
            pandas.DataFrame: Resulting DataFrame (indexed on "school_id").
        """
        raw = self.__read_file("school_assignment.csv")
        # Standardize NCES IDs
        raw["school_id"] = raw["school_id"].apply(fix_ncesid, mode="school")
        raw["district_id"] = raw["district_id"].apply(fix_ncesid, mode="district")
        # Cast each column to its expected type
        column_types = {
            "school_id": str,
            "district_id": str,
            "state_name": str
        }
        typed = self.__format_cols(raw, type_dict=column_types)
        return typed.set_index("school_id", drop=True)

Methods

def get_augmented_school_info(self)

Augments school information to also include estimated 'per-student revenue', alongside all other school attributes.

Returns

pandas.DataFrame: Resulting DataFrame.

Expand source code

def get_augmented_school_info(self):
    """
    Builds the school information table extended with the columns of each
    school's district (e.g. 'per-student revenue').

    Returns:
        pandas.DataFrame: Resulting DataFrame.
    """
    school_info = self.get_school_info()
    district_info = self.get_district_info()
    school_assignment = self.get_school_assignment()
    # Attach each school's district, then that district's attributes
    district_of_school = school_assignment[["district_id"]]
    with_district = school_info.merge(
        district_of_school, left_index=True, right_index=True
    )
    augmented = with_district.merge(
        district_info, left_on="district_id", right_index=True
    )
    # The district ID was only needed as a join key
    return augmented.drop(columns=["district_id"])

def get_district_info(self)

Read district information from a CSV file into a DataFrame.

Returns

pandas.DataFrame: Resulting DataFrame (indexed on "district_id").

Expand source code

def get_district_info(self):
    """
    Loads district information from its CSV file.

    Returns:
        pandas.DataFrame: Resulting DataFrame (indexed on "district_id").
    """
    raw = self.__read_file("district_info.csv")
    # Standardize NCES IDs
    raw["district_id"] = raw["district_id"].apply(fix_ncesid, mode="district")
    # Cast each column to its expected type
    column_types = {
        "district_id": str,
        "adjusted_local_revenue_per_student": float,
        "adjusted_state_revenue_per_student": float,
        "adjusted_federal_revenue_per_student": float,
        "adjusted_total_revenue_per_student": float,
    }
    typed = self.__format_cols(raw, type_dict=column_types)
    return typed.set_index("district_id", drop=True)

def get_school_assignment(self)

Read initial school assignment from a CSV file into a DataFrame.

Returns

pandas.DataFrame: Resulting DataFrame (indexed on "school_id").

Expand source code

def get_school_assignment(self):
    """
    Loads the initial school assignment from its CSV file.

    Returns:
        pandas.DataFrame: Resulting DataFrame (indexed on "school_id").
    """
    raw = self.__read_file("school_assignment.csv")
    # Standardize NCES IDs
    raw["school_id"] = raw["school_id"].apply(fix_ncesid, mode="school")
    raw["district_id"] = raw["district_id"].apply(fix_ncesid, mode="district")
    # Cast each column to its expected type
    column_types = {
        "school_id": str,
        "district_id": str,
        "state_name": str
    }
    typed = self.__format_cols(raw, type_dict=column_types)
    return typed.set_index("school_id", drop=True)

def get_school_info(self)

Read school information from a CSV file into a DataFrame.

Returns

pandas.DataFrame: Resulting DataFrame (indexed on "school_id").

Expand source code

def get_school_info(self):
    """
    Loads school information from its CSV file.

    Returns:
        pandas.DataFrame: Resulting DataFrame (indexed on "school_id").
    """
    raw = self.__read_file("school_info.csv")
    # Standardize NCES IDs
    raw["school_id"] = raw["school_id"].apply(fix_ncesid, mode="school")
    # Drop rows with NaN values (per the original note, these are invalid
    # schools that do not show up in
    # 'https://nces.ed.gov/ccd/schoolsearch/').
    complete = raw.dropna()
    # Cast each column to its expected type
    column_types = {
        "school_id": str,
        "neighbour_ids": str,
        "total_students": int
    }
    typed = self.__format_cols(complete, type_dict=column_types)
    return typed.set_index("school_id", drop=True)