Main summary
Contents
Main summary#
Primary research questions:#
The results presented in this notebook answer the following question:
What proportion of DES healthcare studies share code?
It also answers the sub-questions:
What proportion of these papers use Free and Open Source Simulation (FOSS) software, and of these, how many share their models?
What proportion of these papers tackle Covid-19, and of these, how many share their models?
Imports#
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
## Imports from preprocessing module
from preprocessing import load_clean_dataset
Constants#
# Remote location (GitHub raw URL) of the zipped data extract.
FILE_NAME = (
    'https://raw.githubusercontent.com/TomMonks/'
    'des_sharing_lit_review/main/data/share_sim_data_extract.zip'
)
Functions to generate a high level summary table.#
We include three functions to generate a high level summary table.
The first calculates the summary metrics from the main table.
The second function creates a tabular view of the data split by item type.
The third function formats this as a table suitable for journal publication. In our paper this is Table 2.
def high_level_metrics(df, name='None'):
    '''A simple high level summary of the review.

    Returns a series containing simple high level counts
    and proportions in the data.

    Proportions whose denominator is zero (for example a subgroup that
    contains no COVID-19 studies) are reported as NaN rather than raising
    a ZeroDivisionError.

    Params:
    -------
    df: pd.DataFrame
        A cleaned dataset. Could be overall or subgroups/categories

    name: str, optional (default='None')
        Name assigned to the returned series.

    Returns:
    --------
    pd.Series
        Counts (``n_*``, ``reporting_guide``) and proportions (``per_*``)
        indexed by metric name.
    '''
    def proportion(numerator, denominator):
        # Counts are plain ints, so guard the division explicitly.
        if denominator == 0:
            return float('nan')
        return numerator / denominator

    # Only included studies contribute to any metric.
    included = df[df['study_included'] == 1]
    available = included[included['model_code_available'] == 1]

    results = {}

    # Counts. Note that 'foss_sim' is compared against the string '1'.
    results['n_included'] = len(included)
    results['n_foss'] = len(included[included['foss_sim'] == '1'])
    results['n_covid'] = len(included[included['covid'] == 1])
    results['n_avail'] = len(available)
    results['n_foss_avail'] = len(available[available['foss_sim'] == '1'])
    results['n_covid_avail'] = len(available[available['covid'] == 1])

    # Proportions of all included studies.
    results['per_foss'] = proportion(results['n_foss'],
                                     results['n_included'])
    results['per_covid'] = proportion(results['n_covid'],
                                      results['n_included'])
    results['per_avail'] = proportion(results['n_avail'],
                                      results['n_included'])

    # Proportions within the FOSS / COVID subgroups.
    results['per_foss_avail'] = proportion(results['n_foss_avail'],
                                           results['n_foss'])
    results['per_covid_avail'] = proportion(results['n_covid_avail'],
                                            results['n_covid'])

    # Studies whose reporting guideline field is anything other than 'None'.
    mentions_guide = included['reporting_guidelines_mention'] != 'None'
    results['reporting_guide'] = len(included[mentions_guide])
    results['per_reporting_guide'] = proportion(results['reporting_guide'],
                                                results['n_included'])

    return pd.Series(results, name=name)
def analysis_by_item_type(df_clean, decimals=4):
    '''
    Summarise the review overall and separately for every item type
    found in the data.

    Params:
    -------
    df_clean: pd.DataFrame
        Assumes a cleaned version of the dataset with an 'item_type'
        column.

    decimals: int, optional (default=4)
        Number of decimal places the summary is rounded to.

    Returns:
    -------
    pd.DataFrame
        One row per metric; the first column is 'overall', followed by
        one column per unique item type.
    '''
    item_types = df_clean['item_type'].unique().tolist()

    # Overall summary goes first so that it becomes the leading column.
    summaries = [high_level_metrics(df_clean, 'overall')]
    summaries.extend(
        high_level_metrics(df_clean[df_clean['item_type'] == item_type],
                           name=item_type)
        for item_type in item_types
    )

    # Each series is a row at this point; transpose to metrics-as-rows.
    combined = pd.DataFrame(summaries)
    return combined.T.round(decimals)
def format_table(summary):
    '''
    Create a formatted table of results for a manuscript.

    Only the 'overall' column of the summary is used.

    Params:
    -------
    summary: pd.DataFrame
        Summary with an 'overall' column, indexed by metric name. Must
        contain the rows n_included, n_covid, n_foss, n_avail,
        n_covid_avail, n_foss_avail, per_avail, per_covid_avail and
        per_foss_avail.

    Returns:
    -------
    pd.DataFrame
        Indexed by 'metric' (Total, COVID-19, FOSS) with a column
        'overall' holding the number of studies and a second column
        holding the number shared in "n (percent)" format. Both columns
        contain strings.
    '''
    total_rows = ['n_included', 'n_covid', 'n_foss']
    avail_rows = ['n_avail', 'n_covid_avail', 'n_foss_avail']
    per_rows = ['per_avail', 'per_covid_avail', 'per_foss_avail']
    labels = ['Total', 'COVID-19', 'FOSS']

    # Raw string: the backslash escapes the percent sign for LaTeX and
    # must reach the column name unchanged.
    shared_col = r'shared n (\%)'

    # only work with the overall column
    overall = summary['overall']

    # total number of papers
    totals = overall.loc[total_rows].map('{:,.0f}'.format)

    # no. models that are available from the total
    shared = overall.loc[avail_rows].map('{:,.0f}'.format)

    # proportions converted to percentages; only numeric values are scaled
    per = (overall.loc[per_rows] * 100).map('{:,.1f}'.format)

    # The three selections line up by position (Total, COVID-19, FOSS),
    # not by index label, so combine them as plain lists.
    t1 = pd.DataFrame({
        'metric': labels,
        'overall': totals.tolist(),
        shared_col: [f'{n} ({p})' for n, p in zip(shared, per)],
    })
    return t1.set_index('metric')
Read in data#
# Load the dataset located at FILE_NAME using the preprocessing module's loader.
# NOTE(review): presumably returns a cleaned pd.DataFrame - confirm in preprocessing.
clean = load_clean_dataset(FILE_NAME)
Results#
Create a high level summary of the main dataset#
# Summarise the dataset overall and per item type, then format the
# 'overall' column as a manuscript-style table.
results = analysis_by_item_type(clean)
table = format_table(results)
# Bare expression: displays the table when run as a notebook cell.
table
overall | shared n (\%) | |
---|---|---|
metric | ||
Total | 564 | 47 (8.3) |
COVID-19 | 69 | 17 (24.6) |
FOSS | 101 | 29 (28.7) |
LaTeX#
Here we generate the LaTeX to insert the table into a document.
# Print the formatted table as LaTeX source via the pandas Styler.
print(table.style.to_latex())
\begin{tabular}{lll}
& overall & shared n (\%) \\
metric & & \\
Total & 564 & 47 (8.3) \\
COVID-19 & 69 & 17 (24.6) \\
FOSS & 101 & 29 (28.7) \\
\end{tabular}