# Import required libraries
import numpy as np
import pandas as pdReformat the tables
This notebook converts the provided results tables from wide to long format, and to remove the “sharing” of cells, so that each row contains all the information needed to identify it.
This is to make it easier for me to use and compare against these results during the reproduction.
Set up
# Create list of files
files = [f'supp_tab{i}.csv' for i in range(2, 7)]
files['supp_tab2.csv',
'supp_tab3.csv',
'supp_tab4.csv',
'supp_tab5.csv',
'supp_tab6.csv']
Function to reformat tables
def reformat_table(filename):
'''
Converts the supplementary tables from Lim et al. into a useable format for
processing and comparison against.
Most of these require the same processing, with the exception of table 6,
which has workplace measures instead of end of day categories
Parameters:
----------
filename : string
Name of file containing the table to be reformatted
'''
tab = pd.read_csv(filename, header=None)
# Rename second column, which contains number of staff per shift
tab.rename(columns={tab.columns[2]: 'staff_per_shift'}, inplace=True)
# Preview head of dataframe
display(tab.head())
# Get name of column depending on whether this is table 6 or not
if filename == 'supp_tab6.csv':
column = 'workplace_measure'
else:
column = 'end_of_day'
# Make copies of the first column, which we will extract end of day and
# number of shifts from
tab.insert(loc=1, column=column, value=tab[0])
tab.insert(loc=2, column='shifts_per_day', value=tab[0])
# If this is supplementary table 6, do processing for workplace measures
if filename == 'supp_tab6.csv':
# Just keep rows that list the protective workplace measure used, and
# then fill rows based on value above
measures = ['Social distancing', 'Gloves',
'Surgical mask', 'Gown', 'N95 mask']
tab.loc[~tab['workplace_measure'].isin(measures),
'workplace_measure'] = np.nan
tab['workplace_measure'] = tab['workplace_measure'].ffill()
# Otherwise, do processing for end of day
else:
# Keep the 'End of Day' rows, then remove that string, and populate NaN
# rows with the end of day int above them
tab['end_of_day'] = np.where(
tab['end_of_day'].str.contains('End of Day'),
tab['end_of_day'],
np.nan)
tab['end_of_day'] = tab['end_of_day'].str.replace(
'End of Day ', '').ffill()
# Set the first instance to NaN (as it is not on row with values)
tab.loc[1, column] = np.nan
# Repeat 'End of Day' process, but this time for 'No. of shift/day'
# Get the rows where it contains string 'No. of shift/day', keeping int
shift_a = pd.Series(np.where(
tab['shifts_per_day'].str.contains('No. of shift'),
tab['shifts_per_day'],
np.nan)).str.replace('No. of shift/ day', '')
# Get the rows where number of shifts is just given by an integer
shift_b = pd.to_numeric(tab['shifts_per_day'], errors='coerce')
# Combine into a single column, populate NaN with value from above, and
# replace column in dataframe
tab['shifts_per_day'] = pd.to_numeric(shift_a.fillna(shift_b)).ffill()
# Remove columns with unnecessary information
tab.drop(tab.columns[0], axis=1, inplace=True)
tab.drop(tab.columns[2], axis=1, inplace=True)
# Drop row where all of the results are NaN
tab = tab[tab[tab.columns[3:18]].notna().any(axis=1)]
# Fill value of staff strength based on column to left
tab.loc[0] = tab.loc[0].infer_objects(copy=False).ffill()
# The first two rows contain the total number of staff strength and the
# frequency of staff change in days. We want to make those into the headers.
# We add a prefix to the numbers, then combine them to make column headers
names_a = tab.loc[0].apply(
lambda x: f'strength_{int(x)}_' if ~np.isnan(x) else x)
names_b = tab.loc[1].apply(
lambda x: f'staff_change_{int(x)}' if ~np.isnan(x) else x)
tab.columns = list(tab.columns[0:3]) + list((names_a + names_b).dropna())
# Drop the first two rows (now used for column headers)
tab.drop([0, 1], inplace=True)
# Transform from wide to long
reformat = pd.melt(
tab, id_vars=[column, 'shifts_per_day', 'staff_per_shift'],
var_name='var', value_name='prop_infected')
# Seperate out strength and staff change
reformat[['strength', 'staff_change']] = reformat['var'].str.extract(
r'strength_(\d+)_staff_change_(\d+)')
reformat.drop(columns=['var'], inplace=True)
# Move the proportion infected column to the end
proportion = reformat.pop('prop_infected')
reformat['prop_infected'] = proportion
# Make sure all columns (except workplace_measure) are numeric
cols = [i for i in reformat.columns if i not in ['workplace_measure']]
for col in cols:
reformat[col] = pd.to_numeric(reformat[col])
# Sort in order that makes it easy to compare against original format
reformat.sort_values(by=[column, 'shifts_per_day', 'staff_per_shift',
'strength', 'staff_change'], inplace=True)
# Preview head of dataframe
display(reformat.head(10))
# Save to csv
reformat.to_csv(filename.replace('.csv', '_reformat.csv'), index=False)Apply the function
for f in files:
reformat_table(f)| 0 | 1 | staff_per_shift | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Total number of staff strength | NaN | NaN | 2.00 | NaN | NaN | NaN | NaN | 4.00 | NaN | NaN | NaN | NaN | 6.00 | NaN | NaN | NaN | NaN |
| 1 | End of Day 7 | Freq. of staff change, days | NaN | 1.00 | 3.00 | 7.0 | 14.00 | 21.00 | 1.00 | 3.00 | 7.00 | 14.00 | 21.00 | 1.00 | 3.00 | 7.00 | 14.00 | 21.00 |
| 2 | No. of shift/ day 1 | Number of staff per shift | 5.0 | 0.20 | 0.20 | 0.3 | 0.40 | 0.30 | 0.10 | 0.10 | 0.20 | 0.20 | 0.15 | 0.03 | 0.05 | 0.13 | 0.10 | 0.13 |
| 3 | NaN | NaN | 10.0 | 0.20 | 0.25 | 0.4 | 0.43 | 0.45 | 0.07 | 0.10 | 0.20 | 0.20 | 0.20 | 0.03 | 0.05 | 0.15 | 0.13 | 0.15 |
| 4 | NaN | NaN | 20.0 | 0.31 | 0.33 | 0.5 | 0.50 | 0.50 | 0.07 | 0.11 | 0.25 | 0.25 | 0.25 | 0.03 | 0.08 | 0.17 | 0.17 | 0.17 |
| end_of_day | shifts_per_day | staff_per_shift | strength | staff_change | prop_infected | |
|---|---|---|---|---|---|---|
| 0 | 7 | 1.0 | 5.0 | 2 | 1 | 0.20 |
| 36 | 7 | 1.0 | 5.0 | 2 | 3 | 0.20 |
| 72 | 7 | 1.0 | 5.0 | 2 | 7 | 0.30 |
| 108 | 7 | 1.0 | 5.0 | 2 | 14 | 0.40 |
| 144 | 7 | 1.0 | 5.0 | 2 | 21 | 0.30 |
| 180 | 7 | 1.0 | 5.0 | 4 | 1 | 0.10 |
| 216 | 7 | 1.0 | 5.0 | 4 | 3 | 0.10 |
| 252 | 7 | 1.0 | 5.0 | 4 | 7 | 0.20 |
| 288 | 7 | 1.0 | 5.0 | 4 | 14 | 0.20 |
| 324 | 7 | 1.0 | 5.0 | 4 | 21 | 0.15 |
| 0 | 1 | staff_per_shift | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Total number of staff strength | NaN | NaN | 2.00 | NaN | NaN | NaN | NaN | 4.00 | NaN | NaN | NaN | NaN | 6.00 | NaN | NaN | NaN | NaN |
| 1 | End of Day 7 | Freq. of staff change, days | NaN | 1.00 | 3.00 | 7.00 | 14.00 | 21.00 | 1.00 | 3.00 | 7.00 | 14.00 | 21.00 | 1.00 | 3.00 | 7.00 | 14.00 | 21.00 |
| 2 | No. of shift/ day 1 | Number of staff per shift | 5.0 | 0.10 | 0.10 | 0.20 | 0.10 | 0.20 | 0.05 | 0.05 | 0.10 | 0.05 | 0.05 | 0.03 | 0.03 | 0.07 | 0.07 | 0.03 |
| 3 | NaN | NaN | 10.0 | 0.10 | 0.10 | 0.15 | 0.12 | 0.15 | 0.03 | 0.03 | 0.07 | 0.05 | 0.07 | 0.02 | 0.02 | 0.03 | 0.05 | 0.03 |
| 4 | NaN | NaN | 20.0 | 0.07 | 0.07 | 0.20 | 0.17 | 0.17 | 0.03 | 0.04 | 0.07 | 0.07 | 0.07 | 0.02 | 0.02 | 0.05 | 0.05 | 0.06 |
| end_of_day | shifts_per_day | staff_per_shift | strength | staff_change | prop_infected | |
|---|---|---|---|---|---|---|
| 0 | 7 | 1.0 | 5.0 | 2 | 1 | 0.10 |
| 36 | 7 | 1.0 | 5.0 | 2 | 3 | 0.10 |
| 72 | 7 | 1.0 | 5.0 | 2 | 7 | 0.20 |
| 108 | 7 | 1.0 | 5.0 | 2 | 14 | 0.10 |
| 144 | 7 | 1.0 | 5.0 | 2 | 21 | 0.20 |
| 180 | 7 | 1.0 | 5.0 | 4 | 1 | 0.05 |
| 216 | 7 | 1.0 | 5.0 | 4 | 3 | 0.05 |
| 252 | 7 | 1.0 | 5.0 | 4 | 7 | 0.10 |
| 288 | 7 | 1.0 | 5.0 | 4 | 14 | 0.05 |
| 324 | 7 | 1.0 | 5.0 | 4 | 21 | 0.05 |
| 0 | 1 | staff_per_shift | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Total number of staff strength | NaN | NaN | 2.00 | NaN | NaN | NaN | NaN | 4.00 | NaN | NaN | NaN | NaN | 6.00 | NaN | NaN | NaN | NaN |
| 1 | End of Day 7 | Freq. of staff change, days | NaN | 1.00 | 3.00 | 7.0 | 14.0 | 21.0 | 1.00 | 3.00 | 7.00 | 14.00 | 21.00 | 1.00 | 3.00 | 7.00 | 14.00 | 21.00 |
| 2 | No. of shift/ day 1 | Number of staff per shift | 5.0 | 0.40 | 0.30 | 0.5 | 0.5 | 0.5 | 0.10 | 0.20 | 0.25 | 0.25 | 0.25 | 0.07 | 0.10 | 0.17 | 0.17 | 0.17 |
| 3 | NaN | NaN | 10.0 | 0.43 | 0.45 | 0.5 | 0.5 | 0.5 | 0.12 | 0.21 | 0.25 | 0.25 | 0.25 | 0.07 | 0.12 | 0.17 | 0.17 | 0.17 |
| 4 | NaN | NaN | 20.0 | 0.50 | 0.50 | 0.5 | 0.5 | 0.5 | 0.25 | 0.30 | 0.25 | 0.25 | 0.25 | 0.10 | 0.19 | 0.17 | 0.17 | 0.17 |
| end_of_day | shifts_per_day | staff_per_shift | strength | staff_change | prop_infected | |
|---|---|---|---|---|---|---|
| 0 | 7 | 1.0 | 5.0 | 2 | 1 | 0.40 |
| 36 | 7 | 1.0 | 5.0 | 2 | 3 | 0.30 |
| 72 | 7 | 1.0 | 5.0 | 2 | 7 | 0.50 |
| 108 | 7 | 1.0 | 5.0 | 2 | 14 | 0.50 |
| 144 | 7 | 1.0 | 5.0 | 2 | 21 | 0.50 |
| 180 | 7 | 1.0 | 5.0 | 4 | 1 | 0.10 |
| 216 | 7 | 1.0 | 5.0 | 4 | 3 | 0.20 |
| 252 | 7 | 1.0 | 5.0 | 4 | 7 | 0.25 |
| 288 | 7 | 1.0 | 5.0 | 4 | 14 | 0.25 |
| 324 | 7 | 1.0 | 5.0 | 4 | 21 | 0.25 |
| 0 | 1 | staff_per_shift | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Total number of staff strength | NaN | NaN | 2.00 | NaN | NaN | NaN | NaN | 4.00 | NaN | NaN | NaN | NaN | 6.00 | NaN | NaN | NaN | NaN |
| 1 | End of Day 7 | Freq. of staff change, days | NaN | 1.00 | 3.00 | 7.0 | 14.0 | 21.0 | 1.00 | 3.00 | 7.00 | 14.00 | 21.00 | 1.00 | 3.00 | 7.00 | 14.00 | 21.00 |
| 2 | No. of shift/ day 1 | Number of staff per shift | 5.0 | 0.25 | 0.30 | 0.3 | 0.3 | 0.4 | 0.10 | 0.10 | 0.15 | 0.15 | 0.15 | 0.03 | 0.07 | 0.10 | 0.13 | 0.13 |
| 3 | NaN | NaN | 10.0 | 0.20 | 0.35 | 0.4 | 0.4 | 0.4 | 0.07 | 0.12 | 0.20 | 0.23 | 0.23 | 0.03 | 0.08 | 0.14 | 0.15 | 0.15 |
| 4 | NaN | NaN | 20.0 | 0.47 | 0.62 | 0.5 | 0.5 | 0.5 | 0.09 | 0.28 | 0.25 | 0.25 | 0.25 | 0.04 | 0.15 | 0.17 | 0.17 | 0.17 |
| end_of_day | shifts_per_day | staff_per_shift | strength | staff_change | prop_infected | |
|---|---|---|---|---|---|---|
| 0 | 7 | 1.0 | 5.0 | 2 | 1 | 0.25 |
| 12 | 7 | 1.0 | 5.0 | 2 | 3 | 0.30 |
| 24 | 7 | 1.0 | 5.0 | 2 | 7 | 0.30 |
| 36 | 7 | 1.0 | 5.0 | 2 | 14 | 0.30 |
| 48 | 7 | 1.0 | 5.0 | 2 | 21 | 0.40 |
| 60 | 7 | 1.0 | 5.0 | 4 | 1 | 0.10 |
| 72 | 7 | 1.0 | 5.0 | 4 | 3 | 0.10 |
| 84 | 7 | 1.0 | 5.0 | 4 | 7 | 0.15 |
| 96 | 7 | 1.0 | 5.0 | 4 | 14 | 0.15 |
| 108 | 7 | 1.0 | 5.0 | 4 | 21 | 0.15 |
| 0 | 1 | staff_per_shift | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Total number of staff strength | NaN | NaN | 2.00 | NaN | NaN | NaN | NaN | 4.00 | NaN | NaN | NaN | NaN | 6.00 | NaN | NaN | NaN | NaN |
| 1 | Social distancing | Freq. of staff change, days | NaN | 1.00 | 3.00 | 7.00 | 14.00 | 21.00 | 1.00 | 3.00 | 7.00 | 14.00 | 21.00 | 1.00 | 3.00 | 7.00 | 14.00 | 21.00 |
| 2 | No. of shift/ day 1 | Number of staff per shift | 5.0 | 0.20 | 0.20 | 0.20 | 0.35 | 0.35 | 0.05 | 0.10 | 0.10 | 0.20 | 0.17 | 0.03 | 0.03 | 0.07 | 0.12 | 0.13 |
| 3 | NaN | NaN | 10.0 | 0.20 | 0.25 | 0.20 | 0.45 | 0.45 | 0.05 | 0.07 | 0.10 | 0.23 | 0.20 | 0.03 | 0.03 | 0.08 | 0.15 | 0.13 |
| 4 | NaN | NaN | 20.0 | 0.31 | 0.38 | 0.33 | 0.50 | 0.50 | 0.07 | 0.12 | 0.16 | 0.25 | 0.25 | 0.03 | 0.06 | 0.11 | 0.17 | 0.17 |
| workplace_measure | shifts_per_day | staff_per_shift | strength | staff_change | prop_infected | |
|---|---|---|---|---|---|---|
| 12 | Gloves | 1.0 | 5.0 | 2 | 1 | 0.20 |
| 72 | Gloves | 1.0 | 5.0 | 2 | 3 | 0.20 |
| 132 | Gloves | 1.0 | 5.0 | 2 | 7 | 0.20 |
| 192 | Gloves | 1.0 | 5.0 | 2 | 14 | 0.30 |
| 252 | Gloves | 1.0 | 5.0 | 2 | 21 | 0.30 |
| 312 | Gloves | 1.0 | 5.0 | 4 | 1 | 0.05 |
| 372 | Gloves | 1.0 | 5.0 | 4 | 3 | 0.05 |
| 432 | Gloves | 1.0 | 5.0 | 4 | 7 | 0.10 |
| 492 | Gloves | 1.0 | 5.0 | 4 | 14 | 0.15 |
| 552 | Gloves | 1.0 | 5.0 | 4 | 21 | 0.15 |