# Source code for task_geo.data_sources.covid.fr_covidata.fr_covidata

"""
fr_covidata.py

Functions:
    - fr_covidata_connector: Extracts data from CSV URL
    - fr_covidata_formatter: Cleans CSV data
    - fr_covidata: Combines the two previous functions

Data Credits:
    OpenCOVID19-fr
    https://www.data.gouv.fr/en/datasets/chiffres-cles-concernant-lepidemie-de-covid19-en-france/
    https://github.com/opencovid19-fr/data
"""

import io

import numpy as np
import pandas as pd
import requests

url = (
    'https://raw.githubusercontent.com/opencovid19-fr/'
    'data/master/dist/chiffres-cles.csv'
)


def fr_covidata():
    """Data Source for the French COVID-19 Data.

    Arguments:
        None

    Returns:
        pandas.DataFrame
    """
    # Fetch the raw CSV and hand it straight to the formatter.
    return fr_covidata_formatter(fr_covidata_connector())
def fr_covidata_connector():
    """Extract data from OpenCOVID19-fr's Github repository.

    Description:
        - Downloads the URL's data in a Unicode CSV Format
        - Unicode CSV Format: ACS 5Y UTF-8

    Returns:
        dataset (DataFrame with CSV Data)

    Raises:
        requests.HTTPError: if the download fails (e.g. 404/500), instead
            of silently parsing an HTML error page as CSV.
    """
    response = requests.get(url)
    # Fail loudly on HTTP errors; the original parsed whatever came back.
    response.raise_for_status()
    dataset = pd.read_csv(io.StringIO(response.content.decode('utf-8')))
    return dataset
def fr_covidata_formatter(dataset):
    """Formatter for FR COVID-19 Data.

    Arguments:
        dataset(pandas.DataFrame): Data as returned by fr_covidata_connector.

    Description:
        - Drop unnecessary rows with irrelevant regions' info and only keep
          info related to subregions in Metropolitan France, as well as
          repetitive data
        - Check the dataset for instances where there are more than one
          source of data in the same subregion for the same date, then
          complement all the sources information, and take the highest value
          in case there are different values for the same column, while
          aggregating the sources info
        - Rename/Translate the column titles, and add a country column
          (France)

    Returns:
        frcovidata(pandas.DataFrame)
    """
    # Keep only Metropolitan France subregions: drop aggregate granularities
    # and the overseas departments.
    no_gr = ['region', 'monde', 'pays', 'collectivite-outremer']
    no_mc = ['DEP-971', 'DEP-972', 'DEP-973', 'DEP-974', 'DEP-976']
    dataset = dataset[
        (~dataset.granularite.isin(no_gr))
        & (~dataset.maille_code.isin(no_mc))
    ]
    dataset = dataset.drop(['depistes', 'granularite'], axis=1)

    # Drop every member of a group of rows that repeat the same figures for
    # the same subregion and date (keep=False removes all duplicates).
    dataset = dataset.drop_duplicates(
        subset=['date', 'maille_code', 'cas_confirmes', 'deces',
                'reanimation', 'hospitalises', 'gueris'],
        keep=False)
    dataset['date'] = pd.to_datetime(dataset['date'].astype(str)).dt.date

    # Reset indices:
    dataset = dataset.reset_index(drop=True)

    # Turn source columns' values type to string (they are comma-joined in
    # the aggregation below, which requires str, not NaN/float):
    str_columns = ['source_nom', 'source_url', 'source_archive',
                   'source_type']
    dataset[str_columns] = dataset[str_columns].astype(str)

    # One row per (date, subregion): highest reported value per numeric
    # column, comma-joined source descriptions. ('max' strings instead of
    # np.max: same result, avoids pandas' deprecation of numpy callables
    # in .aggregate.)
    aggre = {
        'cas_confirmes': 'max',
        'cas_ehpad': 'max',
        'cas_confirmes_ehpad': 'max',
        'cas_possibles_ehpad': 'max',
        'deces': 'max',
        'deces_ehpad': 'max',
        'reanimation': 'max',
        'hospitalises': 'max',
        'gueris': 'max',
        'source_nom': ','.join,
        'source_url': ','.join,
        'source_archive': ','.join,
        'source_type': ','.join,
    }
    dataset = dataset.groupby(
        ['date', 'maille_code', 'maille_nom']
    ).aggregate(aggre).reset_index()

    # Rename/Translate the column titles:
    # NOTE(review): 'reanimation' (intensive care) is mapped to 'recovering'
    # as in the original interface — confirm the intended semantics.
    dataset = dataset.rename(
        columns={"maille_code": "subregion_code",
                 "maille_nom": "subregion_name",
                 "cas_confirmes": "confirmed",
                 "deces": "deaths",
                 "reanimation": "recovering",
                 "hospitalises": "hospitalized",
                 "gueris": "recovered",
                 "source_nom": "source_name"})
    dataset['country'] = 'France'

    # BUG FIX: the original indexed with a bare tuple
    # (dataset['subregion_code', 'subregion_name', ...]), which raises
    # KeyError; selecting multiple columns requires a list.
    frcovidata = dataset[[
        'subregion_code', 'subregion_name', 'country', 'date', 'confirmed',
        'hospitalized', 'recovering', 'recovered', 'deaths', 'source_name',
        'source_url', 'source_archive', 'source_type']]
    return frcovidata