# Source code for task_geo.data_sources.covid.fr_covidata.fr_covidata

"""
fr_covidata.py

Functions:
    - fr_covidata_connector: Extracts data from CSV URL
    - fr_covidata_formatter: Cleans CSV data
    - fr_covidata: Combines the two previous functions

Data Credits:
    OpenCOVID19-fr
    https://www.data.gouv.fr/en/datasets/chiffres-cles-concernant-lepidemie-de-covid19-en-france/
    https://github.com/opencovid19-fr/data
"""

import io

import numpy as np
import pandas as pd
import requests

url = (
    'https://raw.githubusercontent.com/opencovid19-fr/'
    'data/master/dist/chiffres-cles.csv'
)


def fr_covidata():
    """Data Source for the French COVID-19 Data.

    Arguments:
        None

    Returns:
        pandas.DataFrame
    """
    # Fetch the raw CSV and hand it straight to the formatter.
    return fr_covidata_formatter(fr_covidata_connector())
def fr_covidata_connector():
    """Extract data from OpenCOVID19-fr's Github repository.

    Description:
        - Downloads the URL's data in a Unicode CSV Format
        - Unicode CSV Format: ACS 5Y UTF-8

    Returns:
        dataset (DataFrame with CSV Data)

    Raises:
        requests.HTTPError: if the download fails (e.g. 404/500), instead
            of silently parsing an HTML error page as CSV.
    """
    response = requests.get(url)
    # Fail loudly on HTTP errors; the original parsed whatever came back.
    response.raise_for_status()
    dataset = pd.read_csv(io.StringIO(response.content.decode('utf-8')))
    return dataset
def fr_covidata_formatter(dataset):
    """Formatter for FR COVID-19 Data.

    Arguments:
        dataset(pandas.DataFrame): Data as returned by fr_covidata_connector.

    Description:
        - Drop unnecessary rows with irrelevant regions' info and only keep
          info related to subregions in Metropolitan France, as well as
          repetitive data
        - Check the dataset for instances where there are more than one
          source of data in the same subregion for the same date, then
          complement all the sources information, and take the highest value
          in case there are different values for the same column, while
          aggregating the sources info
        - Rename/Translate the column titles, and add a country column
          (France)

    Returns:
        frcovidata(pandas.DataFrame)
    """
    # Keep only Metropolitan France subregions: drop aggregate granularities
    # and the overseas departments.
    no_gr = ['region', 'monde', 'pays', 'collectivite-outremer']
    no_mc = ['DEP-971', 'DEP-972', 'DEP-973', 'DEP-974', 'DEP-976']
    dataset = dataset[
        (~dataset.granularite.isin(no_gr))
        & (~dataset.maille_code.isin(no_mc))
    ]
    dataset = dataset.drop(['depistes', 'granularite'], axis=1)

    # Drop every member of a group of rows that repeat the same figures for
    # the same subregion and date (keep=False removes all duplicates).
    dataset = dataset.drop_duplicates(
        subset=['date', 'maille_code', 'cas_confirmes', 'deces',
                'reanimation', 'hospitalises', 'gueris'],
        keep=False)
    dataset['date'] = pd.to_datetime(dataset['date'].astype(str)).dt.date

    # Reset indices:
    dataset = dataset.reset_index(drop=True)

    # Turn source columns' values type to string (they are comma-joined in
    # the aggregation below, which requires str, not NaN/float):
    str_columns = ['source_nom', 'source_url', 'source_archive',
                   'source_type']
    dataset[str_columns] = dataset[str_columns].astype(str)

    # One row per (date, subregion): highest reported value per numeric
    # column, comma-joined source descriptions. ('max' strings instead of
    # np.max: same result, avoids pandas' deprecation of numpy callables
    # in .aggregate.)
    aggre = {
        'cas_confirmes': 'max',
        'cas_ehpad': 'max',
        'cas_confirmes_ehpad': 'max',
        'cas_possibles_ehpad': 'max',
        'deces': 'max',
        'deces_ehpad': 'max',
        'reanimation': 'max',
        'hospitalises': 'max',
        'gueris': 'max',
        'source_nom': ','.join,
        'source_url': ','.join,
        'source_archive': ','.join,
        'source_type': ','.join,
    }
    dataset = dataset.groupby(
        ['date', 'maille_code', 'maille_nom']
    ).aggregate(aggre).reset_index()

    # Rename/Translate the column titles:
    # NOTE(review): 'reanimation' (intensive care) is mapped to 'recovering'
    # as in the original interface — confirm the intended semantics.
    dataset = dataset.rename(
        columns={"maille_code": "subregion_code",
                 "maille_nom": "subregion_name",
                 "cas_confirmes": "confirmed",
                 "deces": "deaths",
                 "reanimation": "recovering",
                 "hospitalises": "hospitalized",
                 "gueris": "recovered",
                 "source_nom": "source_name"})
    dataset['country'] = 'France'

    # BUG FIX: the original indexed with a bare tuple
    # (dataset['subregion_code', 'subregion_name', ...]), which raises
    # KeyError; selecting multiple columns requires a list.
    frcovidata = dataset[[
        'subregion_code', 'subregion_name', 'country', 'date', 'confirmed',
        'hospitalized', 'recovering', 'recovered', 'deaths', 'source_name',
        'source_url', 'source_archive', 'source_type']]
    return frcovidata