Data Testing#

This notebook contains some testing of our data sources to identify potential data coding errors in the main and best practice datasets.

We include the following groupings for testing:

Smoke tests: These test that the data files exist and can be loaded for analysis. Nothing can run without passing these tests.
Testing data fields and subgroups: These test that the data is coded correctly and completely.

1. Imports#

1.1. Standard Imports#

import pandas as pd
import numpy as np

# used in smoke tests
import requests

1.2 Testing imports#

import ipytest
import pytest
# configure ipytest with its defaults for running pytest in notebook cells
ipytest.autoconfig()

1.3 Imports from preprocessing module#

# function for loading full dataset
from preprocessing import load_clean_dataset, load_clean_bpa

2. Constants#

# URL of the zipped main data-extraction dataset
FILE_NAME_MAIN = (
    'https://raw.githubusercontent.com/TomMonks/'
    'des_sharing_lit_review/main/data/share_sim_data_extract.zip'
)

# URL of the zipped best practice audit dataset
FILE_NAME_BPA = (
    'https://raw.githubusercontent.com/TomMonks/'
    'des_sharing_lit_review/main/data/bp_audit.zip'
)

3. Functions#

def num_covid_in_year(df, year):
    '''
    Count the studies published in a given year that are coded
    as Covid-19 DES models.

    Params:
    -----
    df: pd.DataFrame
        Dataset containing the 'pub_yr' and 'covid' columns

    year: int
        Year of publication to count

    Returns:
    -------
    int
        Number of rows where pub_yr == year and covid == 1
    '''
    published_in_year = df['pub_yr'] == year
    is_covid_model = df['covid'] == 1
    return len(df.loc[published_in_year & is_covid_model])

def num_included_in_year(df, year):
    '''
    Count the studies published in a given year that are coded
    as included in the review.

    Params:
    -----
    df: pd.DataFrame
        Dataset containing the 'pub_yr' and 'study_included' columns

    year: int
        Year of publication to count

    Returns:
    -------
    int
        Number of rows where pub_yr == year and study_included == 1
    '''
    matches = (df['study_included'] == 1) & (df['pub_yr'] == year)
    # each True in the mask is one matching study
    return int(matches.sum())

def nan_fields(df, field):
    '''
    Count the included studies (study_included == 1) that have
    a NaN (empty) value in the named field.

    Params:
    -----
    df: pd.DataFrame
        Dataset containing 'study_included' and the field to check

    field: str
        The name of the field to check for NaNs

    Returns:
    -------
    int
        Number of included studies where the field is null
    '''
    is_included = df['study_included'] == 1
    is_missing = df[field].isnull()
    return len(df.loc[is_missing & is_included])

4. Smoke Testing#

4.1. Data Files Exist#

First check if the data files exist at the specified URLs.

%%ipytest
@pytest.mark.parametrize('url', [
    (FILE_NAME_MAIN),
    (FILE_NAME_BPA),
])
def test_file_exists_at_url(url):
    '''
    Send a HEAD request for the data file at the URL and test that the
    status code is 200 (resource exists).
    
    Params:
    ------
    url: str
        URL to test
    '''
    # requests never times out by default; without a timeout an
    # unresponsive host would hang the smoke test instead of failing it.
    r = requests.head(url, allow_redirects=True, timeout=10)
    # if code = 200 then resource exists.
    assert r.status_code == 200

..                                                                                           [100%]
2 passed in 0.55s

4.2 Preprocessing logic returns DataFrame containing data#

Read in each data file and check that it loads into a DataFrame containing data.

%%ipytest

@pytest.mark.parametrize('url, func', [
    (FILE_NAME_MAIN, load_clean_dataset),
    (FILE_NAME_BPA, load_clean_bpa),
])
def test_load_dataframe(url, func):
    '''
    Test that the preprocessing function returns a populated dataframe
    for the specified URL.
    
    Tested by checking the result is a pd.DataFrame with at least 1 row.
    
    Params:
    ------
    url: str
        URL for file
        
    func: object
        Python function containing preprocessing logic. Expected to
        return a dataframe.
    '''
    df = func(url)
    # check the type explicitly: len() alone would also pass for any
    # non-empty list, dict or string returned by mistake.
    assert isinstance(df, pd.DataFrame)
    assert len(df) > 0

..                                                                                           [100%]
2 passed in 0.61s

5. Read in data#

# load both datasets once; `clean` is shared by the field tests below
clean = load_clean_dataset(FILE_NAME_MAIN)
clean_bpa = load_clean_bpa(FILE_NAME_BPA)

6. Testing data fields and subgroups#

6.1 Check coding for Covid-19 models#

There should be no Covid-19 DES models pre-2020.

%%ipytest

@pytest.mark.parametrize('df, year, expected', [
    (clean, 2019, 0),
])
def test_num_covid_in_year(df, year, expected):
    '''
    Test that the number of Covid-19 DES models in a publication year
    equals the expected count.
    '''
    observed = num_covid_in_year(df, year)
    assert observed == expected
    

.                                                                                            [100%]
1 passed in 0.01s

6.2 Studies included#

Test each year has at least 1 study included.
Test that there are no studies where the study included field is set to null.

%%ipytest

@pytest.mark.parametrize('df, year', [
    (clean, 2019),
    (clean, 2020),
    (clean, 2021),
    (clean, 2022),
])
def test_studies_included_in_year(df, year):
    '''
    Test that at least 1 study is coded as included
    for the given publication year.
    '''
    n_included = num_included_in_year(df, year)
    assert n_included >= 1
    

    
def test_all_studies_considered():
    '''
    Test that no study has a null (NaN) value
    in the study_included field.
    '''
    n_uncoded = clean['study_included'].isnull().sum()
    assert n_uncoded == 0

.....                                                                                        [100%]
5 passed in 0.01s

6.3. NaN fields in mandatory data extraction fields#

Test that mandatory fields contain no NaN.

%%ipytest

# NOTE(review): the 'study_included' case cannot fail here, because
# nan_fields only counts rows where study_included == 1.
@pytest.mark.parametrize('df, field', [
    (clean, 'study_included'),
    (clean, 'covid'),
    (clean, 'foss_sim'),
    (clean, 'reporting_guidelines_mention'),
    (clean, 'sim_software'),
])
def test_no_nan_fields_included_studies(df, field):
    '''
    Test that the selected field has no NaN values
    among the included studies.
    '''
    n_missing = nan_fields(df, field)
    assert n_missing == 0

.....                                                                                        [100%]
5 passed in 0.01s

6.4 Binary fields only contain 0 or 1#

The covid and model_code_available fields are mandatory and must be coded as 0 or 1.

def non_zero_one_coding(df, field):
    '''
    Count the included studies (study_included == 1) where the
    named field is coded as anything other than 0 or 1.
    NaN values count as incorrectly coded.

    Params:
    -----
    df: pd.DataFrame
        Dataset containing 'study_included' and the field to check

    field: str
        The name of the binary field to check

    Returns:
    -------
    int
    '''
    values = df[field]
    is_binary = (values == 1) | (values == 0)
    is_included = df['study_included'] == 1
    return len(df.loc[~is_binary & is_included])

%%ipytest

@pytest.mark.parametrize('df, field', [
    (clean, 'covid'),
    (clean, 'model_code_available'),
])
def test_zero_one_coding(df, field):
    '''
    Test that the selected field is coded only as 0 or 1
    among the included studies.
    '''
    n_miscoded = non_zero_one_coding(df, field)
    assert n_miscoded == 0

..                                                                                           [100%]
2 passed in 0.01s

Model and code sharing practices in healthcare discrete-event simulation - a systematic review

Data Testing

Contents

Data Testing#

1. Imports#

1.1. Standard Imports#

1.2 Testing imports#

1.3 Imports from preprocessing module#

2. Constants#

3. Functions#

4. Smoke Testing#

4.1. Data Files Exist#

4.2 Preprocessing logic returns DataFrame containing data#

5. Read in data#

6. Testing data fields and subgroups#

6.1 Check coding for Covid-19 models#

6.2 Studies included#

6.3. NaN fields in mandatory data extraction fields#

6.4 Binary fields only contain 0 or 1#