Main summary#

Primary research questions:#

The results presented in this notebook answer the following question:

  1. What proportion of DES healthcare studies share code?

It also answers the sub-questions:

  • What proportion of these papers use Free and Open Source Simulation, and of these, how many are shared?

  • What proportion of these papers tackle Covid-19, and of these, how many share their models?

Imports#

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
## Imports from preprocessing module
from preprocessing import load_clean_dataset

Constants#

# Location of the zipped data extract used throughout this notebook.
FILE_NAME = (
    'https://raw.githubusercontent.com/TomMonks/'
    'des_sharing_lit_review/main/data/share_sim_data_extract.zip'
)

Functions to generate a high level summary table.#

We include three functions to generate a high level summary table.

  • The first calculates the summary metrics from the main table.

  • The second function creates a tabular view of the data split by item type.

  • The third function formats this as a table suitable for journal publication. In our paper this is Table 2.

def high_level_metrics(df, name='None'):
    '''A simple high level summary of the review.

    Returns simple high level counts and proportions computed over the
    rows of the data where ``study_included == 1``.

    Params:
    -------
    df: pd.DataFrame
        A cleaned dataset.  Could be overall or subgroups/categories

    name: str, optional (default='None')
        Name assigned to the returned Series.

    Returns:
    --------
    pd.Series
        Indexed by metric name.  ``n_*`` entries are counts; ``per_*``
        entries are fractions (count / denominator, not multiplied by
        100).  A ``per_*`` entry is NaN when its denominator is zero,
        e.g. a subgroup containing no Covid-19 studies.
    '''
    def _ratio(numerator, denominator):
        # A proportion of an empty group is undefined: report NaN instead
        # of raising ZeroDivisionError so small subgroups can be summarised.
        return numerator / denominator if denominator else float('nan')

    included = df[df['study_included'] == 1]

    # Boolean masks over the included studies.  Note foss_sim is compared
    # with the string '1' while the other flags are compared with integer 1.
    is_available = included['model_code_available'] == 1
    is_foss = included['foss_sim'] == '1'
    is_covid = included['covid'] == 1
    mentions_guide = included['reporting_guidelines_mention'] != 'None'

    # Key order is preserved in the returned Series, so keep it stable.
    results = {}
    results['n_included'] = len(included)
    results['n_foss'] = int(is_foss.sum())
    results['n_covid'] = int(is_covid.sum())
    results['n_avail'] = int(is_available.sum())
    results['n_foss_avail'] = int((is_available & is_foss).sum())
    results['n_covid_avail'] = int((is_available & is_covid).sum())
    results['per_foss'] = _ratio(results['n_foss'], results['n_included'])
    results['per_covid'] = _ratio(results['n_covid'], results['n_included'])
    results['per_avail'] = _ratio(results['n_avail'], results['n_included'])
    results['per_foss_avail'] = _ratio(results['n_foss_avail'],
                                       results['n_foss'])
    results['per_covid_avail'] = _ratio(results['n_covid_avail'],
                                        results['n_covid'])
    results['reporting_guide'] = int(mentions_guide.sum())
    results['per_reporting_guide'] = _ratio(results['reporting_guide'],
                                            results['n_included'])
    return pd.Series(results, name=name)
def analysis_by_item_type(df_clean, decimals=4):
    '''
    Summarise the dataset overall and separately for each distinct
    value found in its 'item_type' column.

    Params:
    -------
    df_clean: pd.DataFrame
        Assumes a cleaned version of the dataset.

    decimals: int, optional (default=4)
        Number of decimal places the summary values are rounded to.

    Returns:
    -------
    pd.DataFrame
        One row per metric and one column per summary: 'overall' first,
        then one column per item type in order of first appearance.
    '''
    summaries = [high_level_metrics(df_clean, 'overall')]
    summaries.extend(
        high_level_metrics(df_clean[df_clean['item_type'] == kind], name=kind)
        for kind in df_clean['item_type'].unique().tolist()
    )
    # Each Series becomes a row; transpose so that metrics are the rows.
    by_metric = pd.DataFrame(summaries).T
    return by_metric.round(decimals)
def format_table(summary):
    '''
    Create a formatted table of results for a manuscript.

    Only the 'overall' column of the summary is used.

    Params:
    -------
    summary: pd.DataFrame
        Summary with metric names as the index and an 'overall' column.
        Must contain the rows n_included, n_covid, n_foss, n_avail,
        n_covid_avail, n_foss_avail, per_avail, per_covid_avail and
        per_foss_avail; the per_* rows are expected as fractions.

    Returns:
    --------
    pd.DataFrame
        Indexed by 'metric' (Total, COVID-19, FOSS) with two string
        columns: 'overall' (the count) and 'shared n (\\%)' (the shared
        count followed by the percentage in brackets).
    '''
    total_rows = ['n_included', 'n_covid', 'n_foss']
    avail_rows = ['n_avail', 'n_covid_avail', 'n_foss_avail']
    per_rows = ['per_avail', 'per_covid_avail', 'per_foss_avail']
    row_labels = ['Total', 'COVID-19', 'FOSS']

    # only work with the overall column
    overall = summary['overall']

    # The three groups are aligned by position, so drop the metric names.
    totals = overall.loc[total_rows].reset_index(drop=True)
    shared = overall.loc[avail_rows].reset_index(drop=True)
    # Scale only the numeric values from fractions to percentages.
    per = overall.loc[per_rows].reset_index(drop=True) * 100

    # n (%) format; the backslash is escaped explicitly so the column
    # title keeps the LaTeX-safe '\%' without an invalid escape sequence.
    shared_n_per = shared.map('{:,.0f}'.format) \
        + ' (' + per.map('{:,.1f}'.format) + ')'

    table = pd.DataFrame({
        'metric': row_labels,
        'overall': totals.map('{:,.0f}'.format),
        'shared n (\\%)': shared_n_per,
    })
    return table.set_index('metric')
    

Read in data#

# Load the dataset from the FILE_NAME URL using the loader imported from
# the preprocessing module (presumably returns the cleaned DataFrame the
# functions in this notebook expect - see preprocessing for details).
clean = load_clean_dataset(FILE_NAME)

Results#

Create a high level summary of the main dataset#

# Summarise the dataset overall and per item type, then format the
# 'overall' column as the manuscript table.
results = analysis_by_item_type(clean)
table = format_table(results)
# Bare expression so the notebook displays the table as the cell output.
table
overall shared n (\%)
metric
Total 564 47 (8.3)
COVID-19 69 17 (24.6)
FOSS 101 29 (28.7)

LaTeX#

Here we generate the LaTeX to insert the table into a document.

# Convert the table to LaTeX source via its pandas Styler and print it.
print(table.style.to_latex())
\begin{tabular}{lll}
 & overall & shared n (\%) \\
metric &  &  \\
Total & 564 & 47 (8.3) \\
COVID-19 & 69 & 17 (24.6) \\
FOSS & 101 & 29 (28.7) \\
\end{tabular}