Data Testing#

This notebook contains some testing of our data sources to identify potential data coding errors in the main and best practice datasets.

We include the following groupings for testing:

  • Smoke tests: These test that the data files exist and can be loaded for analysis. Nothing can run without passing these tests.

  • Testing data fields and subgroups: These test that the data is coded correctly and completely.

1. Imports#

1.1. Standard Imports#

import pandas as pd
import numpy as np

# used in smoke tests
import requests

1.2 Testing imports#

import ipytest
import pytest
ipytest.autoconfig()

1.3 Imports from preprocessing module#

# functions for loading the main and best practice datasets
from preprocessing import load_clean_dataset, load_clean_bpa

2. Constants#

# URL of the zipped main data extract (passed to load_clean_dataset).
FILE_NAME_MAIN = 'https://raw.githubusercontent.com/TomMonks/' \
    + 'des_sharing_lit_review/main/data/share_sim_data_extract.zip'

# URL of the zipped bp_audit data (passed to load_clean_bpa);
# presumably the best practice audit dataset - confirm against the repo.
FILE_NAME_BPA = 'https://raw.githubusercontent.com/TomMonks/' \
    + 'des_sharing_lit_review/main/data/bp_audit.zip'

3. Functions#

def num_covid_in_year(df, year):
    '''
    Count the rows of `df` published in `year` that are coded
    as a Covid-19 DES model (covid == 1).

    Params:
    -----
    df: pd.DataFrame
        Dataset containing 'pub_yr' and 'covid' columns

    year: int
        Year of publication to match against 'pub_yr'

    Returns:
    -------
    int
        Number of matching rows
    '''
    published_in_year = df['pub_yr'] == year
    is_covid_model = df['covid'] == 1
    matches = df[published_in_year & is_covid_model]
    return len(matches)
def num_included_in_year(df, year):
    '''
    Count the rows of `df` published in `year` that are coded
    as included (study_included == 1).

    Params:
    -----
    df: pd.DataFrame
        Dataset containing 'pub_yr' and 'study_included' columns

    year: int
        Year of publication to match against 'pub_yr'

    Returns:
    -------
    int
        Number of matching rows
    '''
    published_in_year = df['pub_yr'] == year
    is_included = df['study_included'] == 1
    matches = df[published_in_year & is_included]
    return len(matches)
def nan_fields(df, field):
    '''
    Count the included studies (study_included == 1) whose
    value in `field` is null/NaN.

    Params:
    -----
    df: pd.DataFrame
        Dataset containing 'study_included' and `field` columns

    field: str
        The name of the field to check for NaNs

    Returns:
    -------
    int
        Number of included rows where `field` is null
    '''
    is_missing = df[field].isnull()
    is_included = df['study_included'] == 1
    matches = df[is_missing & is_included]
    return len(matches)

4. Smoke Testing#

4.1. Data Files Exist#

  • First check if the data files exist at the specified URLs.

%%ipytest
@pytest.mark.parametrize('url', [
    (FILE_NAME_MAIN),
    (FILE_NAME_BPA),
]) 
def test_file_exists_at_url(url):
    '''
    Request the headers of a data file from a URL and test that
    the status code is 200 (resource exists).
    
    Params:
    ------
    url: str
        URL to test
    '''
    # requests has no default timeout; without one an unresponsive host
    # would hang the smoke test instead of failing it.
    r = requests.head(url, allow_redirects=True, timeout=10)
    # if code = 200 then resource exists.
    assert r.status_code == 200
..                                                                                           [100%]
2 passed in 0.55s

4.2 Preprocessing logic returns DataFrame containing data#

  • Read in the data and check that it loads into a non-empty DataFrame.

%%ipytest

@pytest.mark.parametrize('url, func', [
    (FILE_NAME_MAIN, load_clean_dataset),
    (FILE_NAME_BPA, load_clean_bpa),
])
def test_load_dataframe(url, func):
    '''
    Test that a preprocessing function returns a populated result
    when given the URL of its data file.

    Passes when the returned object has at least 1 row.

    Params:
    ------
    url: str
        URL for file

    func: object
        Python function containing preprocessing logic. Assumed to
        return a dataframe.
    '''
    loaded = func(url)
    n_rows = len(loaded)
    assert n_rows > 0
..                                                                                           [100%]
2 passed in 0.61s

5. Read in data#

# Load each dataset once at module level so the tests below can reuse it.
# NOTE(review): clean_bpa is not referenced by the tests visible in this
# section - confirm whether it is used elsewhere.
clean = load_clean_dataset(FILE_NAME_MAIN)
clean_bpa = load_clean_bpa(FILE_NAME_BPA)

6. Testing data fields and subgroups#

6.1 Check coding for Covid-19 models#

  • There should be no Covid-19 DES models pre-2020.

%%ipytest

@pytest.mark.parametrize('df, year, expected', [
    (clean, 2019, 0),
])
def test_num_covid_in_year(df, year, expected):
    '''
    Test that the number of Covid-19 models coded in a publication
    year equals the expected count.
    '''
    observed = num_covid_in_year(df, year)
    assert observed == expected
    
.                                                                                            [100%]
1 passed in 0.01s

6.2 Studies included#

  • Test each year has at least 1 study included.

  • Test that there are no studies where the study included field is set to null.

%%ipytest

@pytest.mark.parametrize('df, year', [
    (clean, 2019),
    (clean, 2020),
    (clean, 2021),
    (clean, 2022),
])
def test_studies_included_in_year(df, year):
    '''
    Test that at least 1 study is coded as included
    for the given publication year.
    '''
    n_included = num_included_in_year(df, year)
    assert n_included >= 1
    

    
def test_all_studies_considered():
    '''
    Test that no study has a null (uncoded) study_included value.
    '''
    is_uncoded = clean['study_included'].isnull()
    uncoded_studies = clean[is_uncoded]
    assert len(uncoded_studies) == 0
.....                                                                                        [100%]
5 passed in 0.01s

6.3. NaN fields in mandatory data extraction fields#

  • Test that mandatory fields contain no NaN.

%%ipytest

@pytest.mark.parametrize('df, field', [
    (clean, 'study_included'),
    (clean, 'covid'),
    (clean, 'foss_sim'),
    (clean, 'reporting_guidelines_mention'),
    (clean, 'sim_software'),
])
def test_no_nan_fields_included_studies(df, field):
    '''
    Test that no included study has a NaN in the selected field.
    '''
    n_missing = nan_fields(df, field)
    assert n_missing == 0
.....                                                                                        [100%]
5 passed in 0.01s

6.4 Binary fields only contain 0 or 1#

  • covid and model_code_available fields are mandatory 0 or 1.

def non_zero_one_coding(df, field):
    '''
    Count the included studies (study_included == 1) whose value
    in `field` is neither 0 nor 1.

    Params:
    -----
    df: pd.DataFrame
        Dataset containing 'study_included' and `field` columns

    field: str
        The name of the field to check

    Returns:
    -------
    int
        Number of included rows where `field` is not 0 or 1
    '''
    values = df[field]
    not_one = values != 1
    not_zero = values != 0
    is_included = df['study_included'] == 1
    offending = df[not_one & not_zero & is_included]
    return len(offending)
%%ipytest

@pytest.mark.parametrize('df, field', [
    (clean, 'covid'),
    (clean, 'model_code_available'),
])
def test_zero_one_coding(df, field):
    '''
    Test that every included study has the selected field
    coded as 0 or 1.
    '''
    n_invalid = non_zero_one_coding(df, field)
    assert n_invalid == 0
..                                                                                           [100%]
2 passed in 0.01s