Reformat the tables

This notebook converts the provided results tables from wide to long format, and to remove the “sharing” of cells, so that each row contains all the information needed to identify it.

This is to make it easier for me to use and compare against these results during the reproduction.

Set up

# Import required libraries
import numpy as np
import pandas as pd
# Create list of files
files = [f'supp_tab{i}.csv' for i in range(2, 7)]
files
['supp_tab2.csv',
 'supp_tab3.csv',
 'supp_tab4.csv',
 'supp_tab5.csv',
 'supp_tab6.csv']

Function to reformat tables

def reformat_table(filename):
    '''
    Converts the supplementary tables from Lim et al. into a useable format for
    processing and comparison against.

    Most of these require the same processing, with the exception of table 6,
    which has workplace measures instead of end of day categories

    Parameters:
    ----------
    filename : string
        Name of file containing the table to be reformatted
    '''
    tab = pd.read_csv(filename, header=None)

    # Rename second column, which contains number of staff per shift
    tab.rename(columns={tab.columns[2]: 'staff_per_shift'}, inplace=True)

    # Preview head of dataframe
    display(tab.head())

    # Get name of column depending on whether this is table 6 or not
    if filename == 'supp_tab6.csv':
        column = 'workplace_measure'
    else:
        column = 'end_of_day'

    # Make copies of the first column, which we will extract end of day and
    # number of shifts from
    tab.insert(loc=1, column=column, value=tab[0])
    tab.insert(loc=2, column='shifts_per_day', value=tab[0])

    # If this is supplementary table 6, do processing for workplace measures
    if filename == 'supp_tab6.csv':
        # Just keep rows that list the protective workplace measure used, and
        # then fill rows based on value above
        measures = ['Social distancing', 'Gloves',
                    'Surgical mask', 'Gown', 'N95 mask']
        tab.loc[~tab['workplace_measure'].isin(measures),
                'workplace_measure'] = np.nan
        tab['workplace_measure'] = tab['workplace_measure'].ffill()
    # Otherwise, do processing for end of day
    else:
        # Keep the 'End of Day' rows, then remove that string, and populate NaN
        # rows with the end of day int above them
        tab['end_of_day'] = np.where(
            tab['end_of_day'].str.contains('End of Day'),
            tab['end_of_day'],
            np.nan)
        tab['end_of_day'] = tab['end_of_day'].str.replace(
            'End of Day ', '').ffill()

    # Set the first instance to NaN (as it is not on row with values)
    tab.loc[1, column] = np.nan

    # Repeat 'End of Day' process, but this time for 'No. of shift/day'
    # Get the rows where it contains string 'No. of shift/day', keeping int
    shift_a = pd.Series(np.where(
        tab['shifts_per_day'].str.contains('No. of shift'),
        tab['shifts_per_day'],
        np.nan)).str.replace('No. of shift/ day', '')
    # Get the rows where number of shifts is just given by an integer
    shift_b = pd.to_numeric(tab['shifts_per_day'], errors='coerce')
    # Combine into a single column, populate NaN with value from above, and
    # replace column in dataframe
    tab['shifts_per_day'] = pd.to_numeric(shift_a.fillna(shift_b)).ffill()

    # Remove columns with unnecessary information
    tab.drop(tab.columns[0], axis=1, inplace=True)
    tab.drop(tab.columns[2], axis=1, inplace=True)

    # Drop row where all of the results are NaN
    tab = tab[tab[tab.columns[3:18]].notna().any(axis=1)]

    # Fill value of staff strength based on column to left
    tab.loc[0] = tab.loc[0].infer_objects(copy=False).ffill()

    # The first two rows contain the total number of staff strength and the
    # frequency of staff change in days. We want to make those into the headers.
    # We add a prefix to the numbers, then combine them to make column headers
    names_a = tab.loc[0].apply(
        lambda x: f'strength_{int(x)}_' if ~np.isnan(x) else x)
    names_b = tab.loc[1].apply(
        lambda x: f'staff_change_{int(x)}' if ~np.isnan(x) else x)
    tab.columns = list(tab.columns[0:3]) + list((names_a + names_b).dropna())

    # Drop the first two rows (now used for column headers)
    tab.drop([0, 1], inplace=True)

    # Transform from wide to long
    reformat = pd.melt(
        tab, id_vars=[column, 'shifts_per_day', 'staff_per_shift'],
        var_name='var', value_name='prop_infected')

    # Seperate out strength and staff change
    reformat[['strength', 'staff_change']] = reformat['var'].str.extract(
        r'strength_(\d+)_staff_change_(\d+)')
    reformat.drop(columns=['var'], inplace=True)

    # Move the proportion infected column to the end
    proportion = reformat.pop('prop_infected')
    reformat['prop_infected'] = proportion

    # Make sure all columns (except workplace_measure) are numeric
    cols = [i for i in reformat.columns if i not in ['workplace_measure']]
    for col in cols:
        reformat[col] = pd.to_numeric(reformat[col])

    # Sort in order that makes it easy to compare against original format
    reformat.sort_values(by=[column, 'shifts_per_day', 'staff_per_shift',
                             'strength', 'staff_change'], inplace=True)

    # Preview head of dataframe
    display(reformat.head(10))

    # Save to csv
    reformat.to_csv(filename.replace('.csv', '_reformat.csv'), index=False)

Apply the function

for f in files:
    reformat_table(f)
0 1 staff_per_shift 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
0 Total number of staff strength NaN NaN 2.00 NaN NaN NaN NaN 4.00 NaN NaN NaN NaN 6.00 NaN NaN NaN NaN
1 End of Day 7 Freq. of staff change, days NaN 1.00 3.00 7.0 14.00 21.00 1.00 3.00 7.00 14.00 21.00 1.00 3.00 7.00 14.00 21.00
2 No. of shift/ day 1 Number of staff per shift 5.0 0.20 0.20 0.3 0.40 0.30 0.10 0.10 0.20 0.20 0.15 0.03 0.05 0.13 0.10 0.13
3 NaN NaN 10.0 0.20 0.25 0.4 0.43 0.45 0.07 0.10 0.20 0.20 0.20 0.03 0.05 0.15 0.13 0.15
4 NaN NaN 20.0 0.31 0.33 0.5 0.50 0.50 0.07 0.11 0.25 0.25 0.25 0.03 0.08 0.17 0.17 0.17
end_of_day shifts_per_day staff_per_shift strength staff_change prop_infected
0 7 1.0 5.0 2 1 0.20
36 7 1.0 5.0 2 3 0.20
72 7 1.0 5.0 2 7 0.30
108 7 1.0 5.0 2 14 0.40
144 7 1.0 5.0 2 21 0.30
180 7 1.0 5.0 4 1 0.10
216 7 1.0 5.0 4 3 0.10
252 7 1.0 5.0 4 7 0.20
288 7 1.0 5.0 4 14 0.20
324 7 1.0 5.0 4 21 0.15
0 1 staff_per_shift 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
0 Total number of staff strength NaN NaN 2.00 NaN NaN NaN NaN 4.00 NaN NaN NaN NaN 6.00 NaN NaN NaN NaN
1 End of Day 7 Freq. of staff change, days NaN 1.00 3.00 7.00 14.00 21.00 1.00 3.00 7.00 14.00 21.00 1.00 3.00 7.00 14.00 21.00
2 No. of shift/ day 1 Number of staff per shift 5.0 0.10 0.10 0.20 0.10 0.20 0.05 0.05 0.10 0.05 0.05 0.03 0.03 0.07 0.07 0.03
3 NaN NaN 10.0 0.10 0.10 0.15 0.12 0.15 0.03 0.03 0.07 0.05 0.07 0.02 0.02 0.03 0.05 0.03
4 NaN NaN 20.0 0.07 0.07 0.20 0.17 0.17 0.03 0.04 0.07 0.07 0.07 0.02 0.02 0.05 0.05 0.06
end_of_day shifts_per_day staff_per_shift strength staff_change prop_infected
0 7 1.0 5.0 2 1 0.10
36 7 1.0 5.0 2 3 0.10
72 7 1.0 5.0 2 7 0.20
108 7 1.0 5.0 2 14 0.10
144 7 1.0 5.0 2 21 0.20
180 7 1.0 5.0 4 1 0.05
216 7 1.0 5.0 4 3 0.05
252 7 1.0 5.0 4 7 0.10
288 7 1.0 5.0 4 14 0.05
324 7 1.0 5.0 4 21 0.05
0 1 staff_per_shift 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
0 Total number of staff strength NaN NaN 2.00 NaN NaN NaN NaN 4.00 NaN NaN NaN NaN 6.00 NaN NaN NaN NaN
1 End of Day 7 Freq. of staff change, days NaN 1.00 3.00 7.0 14.0 21.0 1.00 3.00 7.00 14.00 21.00 1.00 3.00 7.00 14.00 21.00
2 No. of shift/ day 1 Number of staff per shift 5.0 0.40 0.30 0.5 0.5 0.5 0.10 0.20 0.25 0.25 0.25 0.07 0.10 0.17 0.17 0.17
3 NaN NaN 10.0 0.43 0.45 0.5 0.5 0.5 0.12 0.21 0.25 0.25 0.25 0.07 0.12 0.17 0.17 0.17
4 NaN NaN 20.0 0.50 0.50 0.5 0.5 0.5 0.25 0.30 0.25 0.25 0.25 0.10 0.19 0.17 0.17 0.17
end_of_day shifts_per_day staff_per_shift strength staff_change prop_infected
0 7 1.0 5.0 2 1 0.40
36 7 1.0 5.0 2 3 0.30
72 7 1.0 5.0 2 7 0.50
108 7 1.0 5.0 2 14 0.50
144 7 1.0 5.0 2 21 0.50
180 7 1.0 5.0 4 1 0.10
216 7 1.0 5.0 4 3 0.20
252 7 1.0 5.0 4 7 0.25
288 7 1.0 5.0 4 14 0.25
324 7 1.0 5.0 4 21 0.25
0 1 staff_per_shift 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
0 Total number of staff strength NaN NaN 2.00 NaN NaN NaN NaN 4.00 NaN NaN NaN NaN 6.00 NaN NaN NaN NaN
1 End of Day 7 Freq. of staff change, days NaN 1.00 3.00 7.0 14.0 21.0 1.00 3.00 7.00 14.00 21.00 1.00 3.00 7.00 14.00 21.00
2 No. of shift/ day 1 Number of staff per shift 5.0 0.25 0.30 0.3 0.3 0.4 0.10 0.10 0.15 0.15 0.15 0.03 0.07 0.10 0.13 0.13
3 NaN NaN 10.0 0.20 0.35 0.4 0.4 0.4 0.07 0.12 0.20 0.23 0.23 0.03 0.08 0.14 0.15 0.15
4 NaN NaN 20.0 0.47 0.62 0.5 0.5 0.5 0.09 0.28 0.25 0.25 0.25 0.04 0.15 0.17 0.17 0.17
end_of_day shifts_per_day staff_per_shift strength staff_change prop_infected
0 7 1.0 5.0 2 1 0.25
12 7 1.0 5.0 2 3 0.30
24 7 1.0 5.0 2 7 0.30
36 7 1.0 5.0 2 14 0.30
48 7 1.0 5.0 2 21 0.40
60 7 1.0 5.0 4 1 0.10
72 7 1.0 5.0 4 3 0.10
84 7 1.0 5.0 4 7 0.15
96 7 1.0 5.0 4 14 0.15
108 7 1.0 5.0 4 21 0.15
0 1 staff_per_shift 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
0 Total number of staff strength NaN NaN 2.00 NaN NaN NaN NaN 4.00 NaN NaN NaN NaN 6.00 NaN NaN NaN NaN
1 Social distancing Freq. of staff change, days NaN 1.00 3.00 7.00 14.00 21.00 1.00 3.00 7.00 14.00 21.00 1.00 3.00 7.00 14.00 21.00
2 No. of shift/ day 1 Number of staff per shift 5.0 0.20 0.20 0.20 0.35 0.35 0.05 0.10 0.10 0.20 0.17 0.03 0.03 0.07 0.12 0.13
3 NaN NaN 10.0 0.20 0.25 0.20 0.45 0.45 0.05 0.07 0.10 0.23 0.20 0.03 0.03 0.08 0.15 0.13
4 NaN NaN 20.0 0.31 0.38 0.33 0.50 0.50 0.07 0.12 0.16 0.25 0.25 0.03 0.06 0.11 0.17 0.17
workplace_measure shifts_per_day staff_per_shift strength staff_change prop_infected
12 Gloves 1.0 5.0 2 1 0.20
72 Gloves 1.0 5.0 2 3 0.20
132 Gloves 1.0 5.0 2 7 0.20
192 Gloves 1.0 5.0 2 14 0.30
252 Gloves 1.0 5.0 2 21 0.30
312 Gloves 1.0 5.0 4 1 0.05
372 Gloves 1.0 5.0 4 3 0.05
432 Gloves 1.0 5.0 4 7 0.10
492 Gloves 1.0 5.0 4 14 0.15
552 Gloves 1.0 5.0 4 21 0.15