# Import required libraries
import numpy as np
import pandas as pd
Reformat the tables
This notebook converts the provided results tables from wide to long format, and to remove the “sharing” of cells, so that each row contains all the information needed to identify it.
This is to make it easier for me to use and compare against these results during the reproduction.
Set up
# Create list of files
= [f'supp_tab{i}.csv' for i in range(2, 7)]
files files
['supp_tab2.csv',
'supp_tab3.csv',
'supp_tab4.csv',
'supp_tab5.csv',
'supp_tab6.csv']
Function to reformat tables
def reformat_table(filename):
'''
Converts the supplementary tables from Lim et al. into a useable format for
processing and comparison against.
Most of these require the same processing, with the exception of table 6,
which has workplace measures instead of end of day categories
Parameters:
----------
filename : string
Name of file containing the table to be reformatted
'''
= pd.read_csv(filename, header=None)
tab
# Rename second column, which contains number of staff per shift
={tab.columns[2]: 'staff_per_shift'}, inplace=True)
tab.rename(columns
# Preview head of dataframe
display(tab.head())
# Get name of column depending on whether this is table 6 or not
if filename == 'supp_tab6.csv':
= 'workplace_measure'
column else:
= 'end_of_day'
column
# Make copies of the first column, which we will extract end of day and
# number of shifts from
=1, column=column, value=tab[0])
tab.insert(loc=2, column='shifts_per_day', value=tab[0])
tab.insert(loc
# If this is supplementary table 6, do processing for workplace measures
if filename == 'supp_tab6.csv':
# Just keep rows that list the protective workplace measure used, and
# then fill rows based on value above
= ['Social distancing', 'Gloves',
measures 'Surgical mask', 'Gown', 'N95 mask']
~tab['workplace_measure'].isin(measures),
tab.loc['workplace_measure'] = np.nan
'workplace_measure'] = tab['workplace_measure'].ffill()
tab[# Otherwise, do processing for end of day
else:
# Keep the 'End of Day' rows, then remove that string, and populate NaN
# rows with the end of day int above them
'end_of_day'] = np.where(
tab['end_of_day'].str.contains('End of Day'),
tab['end_of_day'],
tab[
np.nan)'end_of_day'] = tab['end_of_day'].str.replace(
tab['End of Day ', '').ffill()
# Set the first instance to NaN (as it is not on row with values)
1, column] = np.nan
tab.loc[
# Repeat 'End of Day' process, but this time for 'No. of shift/day'
# Get the rows where it contains string 'No. of shift/day', keeping int
= pd.Series(np.where(
shift_a 'shifts_per_day'].str.contains('No. of shift'),
tab['shifts_per_day'],
tab[str.replace('No. of shift/ day', '')
np.nan)).# Get the rows where number of shifts is just given by an integer
= pd.to_numeric(tab['shifts_per_day'], errors='coerce')
shift_b # Combine into a single column, populate NaN with value from above, and
# replace column in dataframe
'shifts_per_day'] = pd.to_numeric(shift_a.fillna(shift_b)).ffill()
tab[
# Remove columns with unnecessary information
0], axis=1, inplace=True)
tab.drop(tab.columns[2], axis=1, inplace=True)
tab.drop(tab.columns[
# Drop row where all of the results are NaN
= tab[tab[tab.columns[3:18]].notna().any(axis=1)]
tab
# Fill value of staff strength based on column to left
0] = tab.loc[0].infer_objects(copy=False).ffill()
tab.loc[
# The first two rows contain the total number of staff strength and the
# frequency of staff change in days. We want to make those into the headers.
# We add a prefix to the numbers, then combine them to make column headers
= tab.loc[0].apply(
names_a lambda x: f'strength_{int(x)}_' if ~np.isnan(x) else x)
= tab.loc[1].apply(
names_b lambda x: f'staff_change_{int(x)}' if ~np.isnan(x) else x)
= list(tab.columns[0:3]) + list((names_a + names_b).dropna())
tab.columns
# Drop the first two rows (now used for column headers)
0, 1], inplace=True)
tab.drop([
# Transform from wide to long
= pd.melt(
reformat =[column, 'shifts_per_day', 'staff_per_shift'],
tab, id_vars='var', value_name='prop_infected')
var_name
# Seperate out strength and staff change
'strength', 'staff_change']] = reformat['var'].str.extract(
reformat[[r'strength_(\d+)_staff_change_(\d+)')
=['var'], inplace=True)
reformat.drop(columns
# Move the proportion infected column to the end
= reformat.pop('prop_infected')
proportion 'prop_infected'] = proportion
reformat[
# Make sure all columns (except workplace_measure) are numeric
= [i for i in reformat.columns if i not in ['workplace_measure']]
cols for col in cols:
= pd.to_numeric(reformat[col])
reformat[col]
# Sort in order that makes it easy to compare against original format
=[column, 'shifts_per_day', 'staff_per_shift',
reformat.sort_values(by'strength', 'staff_change'], inplace=True)
# Preview head of dataframe
10))
display(reformat.head(
# Save to csv
'.csv', '_reformat.csv'), index=False) reformat.to_csv(filename.replace(
Apply the function
for f in files:
reformat_table(f)
0 | 1 | staff_per_shift | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Total number of staff strength | NaN | NaN | 2.00 | NaN | NaN | NaN | NaN | 4.00 | NaN | NaN | NaN | NaN | 6.00 | NaN | NaN | NaN | NaN |
1 | End of Day 7 | Freq. of staff change, days | NaN | 1.00 | 3.00 | 7.0 | 14.00 | 21.00 | 1.00 | 3.00 | 7.00 | 14.00 | 21.00 | 1.00 | 3.00 | 7.00 | 14.00 | 21.00 |
2 | No. of shift/ day 1 | Number of staff per shift | 5.0 | 0.20 | 0.20 | 0.3 | 0.40 | 0.30 | 0.10 | 0.10 | 0.20 | 0.20 | 0.15 | 0.03 | 0.05 | 0.13 | 0.10 | 0.13 |
3 | NaN | NaN | 10.0 | 0.20 | 0.25 | 0.4 | 0.43 | 0.45 | 0.07 | 0.10 | 0.20 | 0.20 | 0.20 | 0.03 | 0.05 | 0.15 | 0.13 | 0.15 |
4 | NaN | NaN | 20.0 | 0.31 | 0.33 | 0.5 | 0.50 | 0.50 | 0.07 | 0.11 | 0.25 | 0.25 | 0.25 | 0.03 | 0.08 | 0.17 | 0.17 | 0.17 |
end_of_day | shifts_per_day | staff_per_shift | strength | staff_change | prop_infected | |
---|---|---|---|---|---|---|
0 | 7 | 1.0 | 5.0 | 2 | 1 | 0.20 |
36 | 7 | 1.0 | 5.0 | 2 | 3 | 0.20 |
72 | 7 | 1.0 | 5.0 | 2 | 7 | 0.30 |
108 | 7 | 1.0 | 5.0 | 2 | 14 | 0.40 |
144 | 7 | 1.0 | 5.0 | 2 | 21 | 0.30 |
180 | 7 | 1.0 | 5.0 | 4 | 1 | 0.10 |
216 | 7 | 1.0 | 5.0 | 4 | 3 | 0.10 |
252 | 7 | 1.0 | 5.0 | 4 | 7 | 0.20 |
288 | 7 | 1.0 | 5.0 | 4 | 14 | 0.20 |
324 | 7 | 1.0 | 5.0 | 4 | 21 | 0.15 |
0 | 1 | staff_per_shift | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Total number of staff strength | NaN | NaN | 2.00 | NaN | NaN | NaN | NaN | 4.00 | NaN | NaN | NaN | NaN | 6.00 | NaN | NaN | NaN | NaN |
1 | End of Day 7 | Freq. of staff change, days | NaN | 1.00 | 3.00 | 7.00 | 14.00 | 21.00 | 1.00 | 3.00 | 7.00 | 14.00 | 21.00 | 1.00 | 3.00 | 7.00 | 14.00 | 21.00 |
2 | No. of shift/ day 1 | Number of staff per shift | 5.0 | 0.10 | 0.10 | 0.20 | 0.10 | 0.20 | 0.05 | 0.05 | 0.10 | 0.05 | 0.05 | 0.03 | 0.03 | 0.07 | 0.07 | 0.03 |
3 | NaN | NaN | 10.0 | 0.10 | 0.10 | 0.15 | 0.12 | 0.15 | 0.03 | 0.03 | 0.07 | 0.05 | 0.07 | 0.02 | 0.02 | 0.03 | 0.05 | 0.03 |
4 | NaN | NaN | 20.0 | 0.07 | 0.07 | 0.20 | 0.17 | 0.17 | 0.03 | 0.04 | 0.07 | 0.07 | 0.07 | 0.02 | 0.02 | 0.05 | 0.05 | 0.06 |
end_of_day | shifts_per_day | staff_per_shift | strength | staff_change | prop_infected | |
---|---|---|---|---|---|---|
0 | 7 | 1.0 | 5.0 | 2 | 1 | 0.10 |
36 | 7 | 1.0 | 5.0 | 2 | 3 | 0.10 |
72 | 7 | 1.0 | 5.0 | 2 | 7 | 0.20 |
108 | 7 | 1.0 | 5.0 | 2 | 14 | 0.10 |
144 | 7 | 1.0 | 5.0 | 2 | 21 | 0.20 |
180 | 7 | 1.0 | 5.0 | 4 | 1 | 0.05 |
216 | 7 | 1.0 | 5.0 | 4 | 3 | 0.05 |
252 | 7 | 1.0 | 5.0 | 4 | 7 | 0.10 |
288 | 7 | 1.0 | 5.0 | 4 | 14 | 0.05 |
324 | 7 | 1.0 | 5.0 | 4 | 21 | 0.05 |
0 | 1 | staff_per_shift | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Total number of staff strength | NaN | NaN | 2.00 | NaN | NaN | NaN | NaN | 4.00 | NaN | NaN | NaN | NaN | 6.00 | NaN | NaN | NaN | NaN |
1 | End of Day 7 | Freq. of staff change, days | NaN | 1.00 | 3.00 | 7.0 | 14.0 | 21.0 | 1.00 | 3.00 | 7.00 | 14.00 | 21.00 | 1.00 | 3.00 | 7.00 | 14.00 | 21.00 |
2 | No. of shift/ day 1 | Number of staff per shift | 5.0 | 0.40 | 0.30 | 0.5 | 0.5 | 0.5 | 0.10 | 0.20 | 0.25 | 0.25 | 0.25 | 0.07 | 0.10 | 0.17 | 0.17 | 0.17 |
3 | NaN | NaN | 10.0 | 0.43 | 0.45 | 0.5 | 0.5 | 0.5 | 0.12 | 0.21 | 0.25 | 0.25 | 0.25 | 0.07 | 0.12 | 0.17 | 0.17 | 0.17 |
4 | NaN | NaN | 20.0 | 0.50 | 0.50 | 0.5 | 0.5 | 0.5 | 0.25 | 0.30 | 0.25 | 0.25 | 0.25 | 0.10 | 0.19 | 0.17 | 0.17 | 0.17 |
end_of_day | shifts_per_day | staff_per_shift | strength | staff_change | prop_infected | |
---|---|---|---|---|---|---|
0 | 7 | 1.0 | 5.0 | 2 | 1 | 0.40 |
36 | 7 | 1.0 | 5.0 | 2 | 3 | 0.30 |
72 | 7 | 1.0 | 5.0 | 2 | 7 | 0.50 |
108 | 7 | 1.0 | 5.0 | 2 | 14 | 0.50 |
144 | 7 | 1.0 | 5.0 | 2 | 21 | 0.50 |
180 | 7 | 1.0 | 5.0 | 4 | 1 | 0.10 |
216 | 7 | 1.0 | 5.0 | 4 | 3 | 0.20 |
252 | 7 | 1.0 | 5.0 | 4 | 7 | 0.25 |
288 | 7 | 1.0 | 5.0 | 4 | 14 | 0.25 |
324 | 7 | 1.0 | 5.0 | 4 | 21 | 0.25 |
0 | 1 | staff_per_shift | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Total number of staff strength | NaN | NaN | 2.00 | NaN | NaN | NaN | NaN | 4.00 | NaN | NaN | NaN | NaN | 6.00 | NaN | NaN | NaN | NaN |
1 | End of Day 7 | Freq. of staff change, days | NaN | 1.00 | 3.00 | 7.0 | 14.0 | 21.0 | 1.00 | 3.00 | 7.00 | 14.00 | 21.00 | 1.00 | 3.00 | 7.00 | 14.00 | 21.00 |
2 | No. of shift/ day 1 | Number of staff per shift | 5.0 | 0.25 | 0.30 | 0.3 | 0.3 | 0.4 | 0.10 | 0.10 | 0.15 | 0.15 | 0.15 | 0.03 | 0.07 | 0.10 | 0.13 | 0.13 |
3 | NaN | NaN | 10.0 | 0.20 | 0.35 | 0.4 | 0.4 | 0.4 | 0.07 | 0.12 | 0.20 | 0.23 | 0.23 | 0.03 | 0.08 | 0.14 | 0.15 | 0.15 |
4 | NaN | NaN | 20.0 | 0.47 | 0.62 | 0.5 | 0.5 | 0.5 | 0.09 | 0.28 | 0.25 | 0.25 | 0.25 | 0.04 | 0.15 | 0.17 | 0.17 | 0.17 |
end_of_day | shifts_per_day | staff_per_shift | strength | staff_change | prop_infected | |
---|---|---|---|---|---|---|
0 | 7 | 1.0 | 5.0 | 2 | 1 | 0.25 |
12 | 7 | 1.0 | 5.0 | 2 | 3 | 0.30 |
24 | 7 | 1.0 | 5.0 | 2 | 7 | 0.30 |
36 | 7 | 1.0 | 5.0 | 2 | 14 | 0.30 |
48 | 7 | 1.0 | 5.0 | 2 | 21 | 0.40 |
60 | 7 | 1.0 | 5.0 | 4 | 1 | 0.10 |
72 | 7 | 1.0 | 5.0 | 4 | 3 | 0.10 |
84 | 7 | 1.0 | 5.0 | 4 | 7 | 0.15 |
96 | 7 | 1.0 | 5.0 | 4 | 14 | 0.15 |
108 | 7 | 1.0 | 5.0 | 4 | 21 | 0.15 |
0 | 1 | staff_per_shift | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Total number of staff strength | NaN | NaN | 2.00 | NaN | NaN | NaN | NaN | 4.00 | NaN | NaN | NaN | NaN | 6.00 | NaN | NaN | NaN | NaN |
1 | Social distancing | Freq. of staff change, days | NaN | 1.00 | 3.00 | 7.00 | 14.00 | 21.00 | 1.00 | 3.00 | 7.00 | 14.00 | 21.00 | 1.00 | 3.00 | 7.00 | 14.00 | 21.00 |
2 | No. of shift/ day 1 | Number of staff per shift | 5.0 | 0.20 | 0.20 | 0.20 | 0.35 | 0.35 | 0.05 | 0.10 | 0.10 | 0.20 | 0.17 | 0.03 | 0.03 | 0.07 | 0.12 | 0.13 |
3 | NaN | NaN | 10.0 | 0.20 | 0.25 | 0.20 | 0.45 | 0.45 | 0.05 | 0.07 | 0.10 | 0.23 | 0.20 | 0.03 | 0.03 | 0.08 | 0.15 | 0.13 |
4 | NaN | NaN | 20.0 | 0.31 | 0.38 | 0.33 | 0.50 | 0.50 | 0.07 | 0.12 | 0.16 | 0.25 | 0.25 | 0.03 | 0.06 | 0.11 | 0.17 | 0.17 |
workplace_measure | shifts_per_day | staff_per_shift | strength | staff_change | prop_infected | |
---|---|---|---|---|---|---|
12 | Gloves | 1.0 | 5.0 | 2 | 1 | 0.20 |
72 | Gloves | 1.0 | 5.0 | 2 | 3 | 0.20 |
132 | Gloves | 1.0 | 5.0 | 2 | 7 | 0.20 |
192 | Gloves | 1.0 | 5.0 | 2 | 14 | 0.30 |
252 | Gloves | 1.0 | 5.0 | 2 | 21 | 0.30 |
312 | Gloves | 1.0 | 5.0 | 4 | 1 | 0.05 |
372 | Gloves | 1.0 | 5.0 | 4 | 3 | 0.05 |
432 | Gloves | 1.0 | 5.0 | 4 | 7 | 0.10 |
492 | Gloves | 1.0 | 5.0 | 4 | 14 | 0.15 |
552 | Gloves | 1.0 | 5.0 | 4 | 21 | 0.15 |