Source code for task_geo.data_sources.demographics.us_census.us_census

"""
us_census.py

Functions:
    - us_census_connector: Downloads the Census zip archive and reads the ACS 5Y CSV from it
    - us_census_formatter: Cleans CSV data
    - us_census: Combines the two previous functions

Data Credits:
    The United States Census Bureau
    https://data.census.gov/
"""

import io
import urllib.request
import zipfile

import pandas as pd

# Download endpoint of the zipped Census table export fetched by us_census_connector.
url = 'https://data.census.gov/api/access/table/download?download_id=iuGrLXEBm-bIwvlxENnx'


def us_census():
    """Return the formatted US census dataset.

    Runs ``us_census_connector`` to fetch the raw table and passes the
    result through ``us_census_formatter``.

    Arguments:
        None

    Returns:
        pandas.DataFrame
    """
    return us_census_formatter(us_census_connector())
def us_census_connector():
    """Extract data from the US Census.

    Description:
        - Downloads the zip archive found at the module-level ``url``
        - Reads the correct CSV straight out of the archive
        - Correct CSV: ACS 5Y Statistics (sixth entry of the archive listing)

    The archive is kept in memory and the CSV is read directly from it, so
    no ``uscensus.zip`` or extracted CSV file is left behind in the current
    working directory.

    Returns:
        data (DataFrame with CSV Data)

    Raises:
        IndexError: if the archive lists fewer than six members.
    """
    # Buffer the whole download: zipfile needs a seekable file object.
    with urllib.request.urlopen(url) as response:
        payload = io.BytesIO(response.read())

    with zipfile.ZipFile(payload) as archive:
        member = archive.namelist()[5]
        # Reading through the archive avoids relying on the on-disk name,
        # which ZipFile.extract may sanitise to differ from the member name.
        with archive.open(member) as handle:
            data = pd.read_csv(handle, low_memory=False)
    return data
def us_census_formatter(data):
    """Formatter for US Census.

    Arguments:
        data(pandas.DataFrame): Data as returned by us_census_connector.
            The frame is not modified; all work happens on a copy.

    Description:
        - Promote the first row to column labels and drop that row
        - Drop the ``id`` column and index by 'Geographic Area Name'
        - Keep only the columns whose label contains '2018'
        - Shorten each label to its last ``!!``-separated part, lowercased
        - Replace the "N" placeholder with 0.0
        - Drop the last two columns (by label)

    Returns:
        pandas.DataFrame
    """
    # Work on a copy so the caller's DataFrame keeps its columns and rows.
    frame = data.copy()

    # The first row carries the descriptive column labels.
    frame.columns = frame.iloc[0]
    frame = frame.drop(0).drop("id", axis=1)
    frame = frame.set_index('Geographic Area Name')

    frame = frame[[label for label in frame.columns if '2018' in label]]
    frame.columns = [label.split("!!")[-1].lower() for label in frame.columns]
    frame = frame.replace("N", 0.0)

    # Drop the two trailing columns. The drop is by label, so any earlier
    # column sharing the trailing label is removed as well.
    for _ in range(2):
        frame = frame.drop(frame.columns[-1], axis=1)
    return frame