Data Testing
Contents
Data Testing#
This notebook contains some testing of our data sources to identify potential data coding errors in the main and best practice datasets.
We include the following groupings for testing:
Smoke tests: These test that the data files exist and can be loaded for analysis. Nothing can run without passing these tests.
Testing data fields and subgroups: These test that the data is coded correctly and completely.
1. Imports#
1.1. Standard Imports#
import pandas as pd
import numpy as np
# used in smoke tests
import requests
1.2 Testing imports#
import ipytest
import pytest
ipytest.autoconfig()
1.3 Imports from preprocessing module#
# functions for loading the main and best practice datasets
from preprocessing import load_clean_dataset, load_clean_bpa
2. Constants#
# URLs of the zipped data extracts used throughout this notebook.
FILE_NAME_MAIN = (
    'https://raw.githubusercontent.com/TomMonks/'
    'des_sharing_lit_review/main/data/share_sim_data_extract.zip'
)
FILE_NAME_BPA = (
    'https://raw.githubusercontent.com/TomMonks/'
    'des_sharing_lit_review/main/data/bp_audit.zip'
)
3. Functions#
def num_covid_in_year(df, year):
    '''
    Count the studies with a Covid-19 DES model published in a
    given year.

    Params:
    -----
    df: pd.DataFrame
        Dataset containing `pub_yr` and `covid` columns.

    year: int
        Year of publication to count.

    Returns:
    -------
    int
        Number of rows where `pub_yr` equals `year` and `covid` equals 1.
    '''
    published_in_year = df['pub_yr'] == year
    is_covid_model = df['covid'] == 1
    matching_rows = df.loc[published_in_year & is_covid_model]
    return len(matching_rows)
def num_included_in_year(df, year):
    '''
    Count the included studies published in a given year.

    Params:
    -----
    df: pd.DataFrame
        Dataset containing `pub_yr` and `study_included` columns.

    year: int
        Year of publication to count.

    Returns:
    -------
    int
        Number of rows where `pub_yr` equals `year` and
        `study_included` equals 1.
    '''
    published_in_year = df['pub_yr'] == year
    is_included = df['study_included'] == 1
    matching_rows = df.loc[published_in_year & is_included]
    return len(matching_rows)
def nan_fields(df, field):
    '''
    Return the number of included studies that have a missing (NaN)
    value in the specified field.

    Only rows where `study_included` equals 1 are counted; missing
    values in excluded studies are ignored.

    Params:
    -----
    df: pd.DataFrame
        Dataset containing `study_included` and the column named by
        `field`.

    field: str
        The name of the field to check for NaNs

    Returns:
    -------
    int
        Count of included rows where `field` is NaN.
    '''
    is_missing = df[field].isnull()
    is_included = df['study_included'] == 1
    return len(df[is_missing & is_included])
4. Smoke Testing#
4.1. Data Files Exist#
First check if the data files exist at the specified URLs.
%%ipytest
@pytest.mark.parametrize('url', [
    FILE_NAME_MAIN,
    FILE_NAME_BPA,
])
def test_file_exists_at_url(url):
    '''
    Request the headers of a data file from a URL and test that the
    status code is 200 (the resource exists).

    Params:
    ------
    url: str
        URL to test
    '''
    # requests applies no timeout by default, so without one an
    # unresponsive host would hang the test run indefinitely.
    r = requests.head(url, allow_redirects=True, timeout=30)
    # if code = 200 then resource exists.
    assert r.status_code == 200
.. [100%]
2 passed in 0.55s
4.2 Preprocessing logic returns DataFrame containing data#
Read in each data file and check that it loads into a
DataFrame containing at least one row.
%%ipytest
@pytest.mark.parametrize('url, func', [
    (FILE_NAME_MAIN, load_clean_dataset),
    (FILE_NAME_BPA, load_clean_bpa),
])
def test_load_dataframe(url, func):
    '''
    Test that the preprocessing function returns a populated dataframe
    for the specified URL.

    Tested by checking that the result is a pandas DataFrame and that
    it has at least 1 row.

    Params:
    ------
    url: str
        URL for file

    func: object
        Python function containing preprocessing logic. Called with the
        URL as its only argument.
    '''
    df = func(url)
    # Check the type explicitly: a bare length check would also pass for
    # any other non-empty sized object (list, dict, str, ...).
    assert isinstance(df, pd.DataFrame)
    assert len(df) > 0
.. [100%]
2 passed in 0.61s
5. Read in data#
# Load both datasets once at module level. `clean` is passed to the
# parametrized tests in the sections that follow.
clean = load_clean_dataset(FILE_NAME_MAIN)
clean_bpa = load_clean_bpa(FILE_NAME_BPA)
6. Testing data fields and subgroups#
6.1 Check coding for Covid-19 models#
There should be no Covid-19 DES models pre-2020.
%%ipytest
@pytest.mark.parametrize(
    'df, year, expected',
    [(clean, 2019, 0)],
)
def test_num_covid_in_year(df, year, expected):
    '''
    Test that the number of Covid-19 studies in a publication year
    equals the expected count.
    '''
    observed = num_covid_in_year(df, year)
    assert observed == expected
. [100%]
1 passed in 0.01s
6.2 Studies included#
Test each year has at least 1 study included.
Test that there are no studies where the study included field is set to null.
%%ipytest
@pytest.mark.parametrize(
    'df, year',
    [(clean, pub_year) for pub_year in (2019, 2020, 2021, 2022)],
)
def test_studies_included_in_year(df, year):
    '''
    Test that at least one study is marked as included for the
    given publication year.
    '''
    included_count = num_included_in_year(df, year)
    assert included_count >= 1
def test_all_studies_considered():
    '''
    Test that no study has a null (NaN) value in the `study_included`
    field, i.e. every study has been given an inclusion decision.

    Note: this checks only for missing values; it does not check that
    the values present are restricted to 0 or 1.
    '''
    missing_decisions = clean['study_included'].isnull().sum()
    assert missing_decisions == 0
..... [100%]
5 passed in 0.01s
6.3. NaN fields in mandatory data extraction fields#
Test that mandatory fields contain no NaN.
%%ipytest
# NOTE: nan_fields only counts rows where study_included == 1, so the
# 'study_included' case below can never report a missing value.
@pytest.mark.parametrize(
    'df, field',
    [
        (clean, mandatory_field)
        for mandatory_field in (
            'study_included',
            'covid',
            'foss_sim',
            'reporting_guidelines_mention',
            'sim_software',
        )
    ],
)
def test_no_nan_fields_included_studies(df, field):
    '''
    Test that the selected field has no NaN values among the
    included studies.
    '''
    missing_count = nan_fields(df, field)
    assert missing_count == 0
..... [100%]
5 passed in 0.01s
6.4 Binary fields only contain 0 or 1#
The covid and model_code_available fields are mandatory and must be coded as 0 or 1.
def non_zero_one_coding(df, field):
    '''
    Count the included studies whose value in a field is neither
    0 nor 1.

    Params:
    -----
    df: pd.DataFrame
        Dataset containing `study_included` and the column named by
        `field`.

    field: str
        The name of the field to check.

    Returns:
    -------
    int
        Number of rows where `study_included` equals 1 and `field`
        is not equal to 0 or 1 (NaN values are counted).
    '''
    not_one = df[field] != 1
    not_zero = df[field] != 0
    is_included = df['study_included'] == 1
    return len(df[not_one & not_zero & is_included])
%%ipytest
@pytest.mark.parametrize(
    'df, field',
    [
        (clean, binary_field)
        for binary_field in ('covid', 'model_code_available')
    ],
)
def test_zero_one_coding(df, field):
    '''
    Test that a binary field contains only 0 or 1 for every
    included study.
    '''
    invalid_count = non_zero_one_coding(df, field)
    assert invalid_count == 0
.. [100%]
2 passed in 0.01s