Model output comparison

Model output comparison#

Here we run the CCU models for stage one and two (our internal replication test) and compare the quantitative results.

This notebook assume that it is in the same directory as the following Python modules:

ccu_formatted_code (produced in stage 1 of the study)
ccu_formatted_code_stage2 (produced in the internal replication test)

1. Imports#

1.1 General libraries#

import numpy as np
import pandas as pd
import scipy.stats as stats

1.2 CCU stage 1 code#

from ccu_formatted_code import (
    get_experiments, 
    run_all_experiments,
    summary_of_experiments
)

1.3. CCU stage 2 code#

Note that we give stage 2 code alias for their functions (as the function and class names same in both modules).

from ccu_formatted_code_stage2 import (
    get_experiments as get_experimentsS2,
    run_all_experiments as run_all_experimentsS2,
    summary_of_experiments as summary_of_experimentsS2
)

2. Table Formatting functions#

These functions modify and combine the outputs from two models so that they are suitable for an academic paper.

def format_stage_table(model_results: pd.DataFrame, stage: str) -> pd.DataFrame:
    """
    Each model produces a slightly different format of results table 
    (different headers). This function updates to a common col and row 
    header format.
    """
    # recode KPI names
    common_var_names = {
        "Total Cancelled Elective Operations": 
            "1. Cancelled Elective Operations",
        "Cancelled Elective Operations": "1. Cancelled Elective Operations",
        "Bed Utilization": "2. Bed Utilization",
        "Bed Occupancy": "3. Bed Occupancy",
        "Mean Waiting Time Unplanned": 
            "4. Mean Unplanned Admission Waiting Time (hours)",
        "Patient Count": "0. Patient Count",
        "Mean Unplanned Admission Waiting Time (hours)":
            "4. Mean Unplanned Admission Waiting Time (hours)"
    }
    
    # record stage 1 column names to be identical to stage 2
    recoded_columns_names = {i:f'{name}' for i, 
                             name in zip(range(len(results_stage_2.columns)), 
                                         results_stage_2.columns)}

    # create common format
    df = (model_results
          .rename(common_var_names)
          .pipe(lambda d: d.set_axis(range(d.shape[1]), axis=1))
          .rename(columns=recoded_columns_names)
          .sort_index()
          )
    # give columns simpler headers for mean (std)
    for i in range(23, 29):
        df[f'{i} beds'] = df[f'Experiment with {i} beds_mean'].map('{:,.1f}'.format) \
            + ' (' + df[f'Experiment with {i} beds_std'].map('{:,.2f}'.format) \
            + ")"
        
        df = df.drop([f'Experiment with {i} beds_mean', 
                      f'Experiment with {i} beds_std'], axis=1)

    df['Study Stage'] = [stage for _ in range(len(df))]
    df[['Study Stage'] + df.columns[:-1].tolist()]
    df = df.reset_index()
    df = df.rename(columns={"index":"metric"})
    df = df.set_index('Study Stage')
    return df

def create_formatted_latex_table(formatted_stage_1: pd.DataFrame, 
                                 formatted_stage_2: pd.DataFrame) -> pd.DataFrame:                            
    """Combine results tables from stage 1 and 2 into a formatted
    LaTeX table for manuscript.
    """
    combined_results = pd.concat([formatted_stage_1, formatted_stage_2])
    combined_results = combined_results.reset_index()
    combined_results = combined_results.set_index(["Study Stage", "metric"])
    return combined_results

3. Run experiments with both models#

Here we run a batch of experiments that vary the capacity of the CCU beds between 23 and 28. We run multiple experiments to test that the models produce the same results when parameters are updated.

3.1 Stage 1 results#

def run_stage_1_batch_experiments():
    """Script to vary capacity between 23 and 28
    in the stage 1 model.
    """
    experiments = get_experiments()
    
    # 2. run all experiments in a batch (5 replications each)
    all_results = run_all_experiments(experiments, 5)
    
    # 3. display results summary (2dp)
    summary = summary_of_experiments(all_results).round(2)
    
    # show results
    return summary

results_stage_1 = run_stage_1_batch_experiments()

Running experiment: Experiment_23
Running experiment: Experiment_24

Running experiment: Experiment_25
Running experiment: Experiment_26

Running experiment: Experiment_27
Running experiment: Experiment_28

3.2 Stage 2 results#

def run_stage_2_batch_experiments():
    """Script to vary capacity between 23 and 28
    in the stage 2 model.
    """
    experiments_stage_2 = get_experimentsS2()

    # 2. run all experiments in a batch (5 replications each)
    all_results_stage_2 = run_all_experimentsS2(experiments_stage_2, 5)
    
    # 3. display results summary (2dp)
    summary_stage_2 = summary_of_experimentsS2(all_results_stage_2).round(2)
    
    # show results
    return summary_stage_2

results_stage_2 = run_stage_2_batch_experiments()

Running Experiment with 23 beds
Running Experiment with 24 beds

Running Experiment with 25 beds
Running Experiment with 26 beds

Running Experiment with 27 beds
Running Experiment with 28 beds

3. Format and combine results into a single table#

To summarise the results we combine the experiments from the two models into a single summary table. Each column shows the mean (standard deviation) for a different number of beds between 23 and 28

formatted_stage_1 = format_stage_table(results_stage_1, "Stage 1")
formatted_stage_2 = format_stage_table(results_stage_2, "Stage 2")
combined_results = create_formatted_latex_table(formatted_stage_1,
                                                formatted_stage_2)
combined_results

		23 beds	24 beds	25 beds	26 beds	27 beds	28 beds
Study Stage	metric
Stage 1	0. Patient Count	1,650.4 (17.83)	1,650.4 (17.83)	1,650.4 (17.83)	1,650.4 (17.83)	1,650.4 (17.83)	1,650.4 (17.83)
	1. Cancelled Elective Operations	390.6 (30.57)	337.8 (38.75)	279.0 (39.13)	231.4 (33.83)	178.4 (32.46)	139.8 (27.58)
	2. Bed Utilization	0.9 (0.02)	0.9 (0.02)	0.9 (0.02)	0.9 (0.02)	0.8 (0.02)	0.8 (0.02)
	3. Bed Occupancy	21.3 (0.49)	21.8 (0.50)	22.3 (0.54)	22.6 (0.56)	23.0 (0.58)	23.3 (0.62)
	4. Mean Unplanned Admission Waiting Time (hours)	103.8 (72.08)	62.5 (55.23)	35.0 (29.28)	20.8 (15.59)	12.0 (7.66)	7.0 (3.76)
Stage 2	0. Patient Count	1,650.4 (17.83)	1,650.4 (17.83)	1,650.4 (17.83)	1,650.4 (17.83)	1,650.4 (17.83)	1,650.4 (17.83)
	1. Cancelled Elective Operations	390.6 (30.57)	337.8 (38.75)	279.0 (39.13)	231.4 (33.83)	178.4 (32.46)	139.8 (27.58)
	2. Bed Utilization	0.9 (0.02)	0.9 (0.02)	0.9 (0.02)	0.9 (0.02)	0.8 (0.02)	0.8 (0.02)
	3. Bed Occupancy	21.3 (0.49)	21.8 (0.50)	22.3 (0.54)	22.6 (0.56)	23.0 (0.58)	23.3 (0.62)
	4. Mean Unplanned Admission Waiting Time (hours)	103.8 (72.08)	62.5 (55.23)	35.0 (29.28)	20.8 (15.59)	12.0 (7.66)	7.0 (3.76)

# save results to file to show in results reporting.
combined_results.to_csv("../04_results/01_CCU/ccu_model_comparison.csv", 
                        index=True)

4. Output formatted results to LaTeX#

print(combined_results.T.style.to_latex(
    hrules=True, 
    label="Table:1", 
    caption="Comparison of critical care model outputs: stage 1 versus stage 2 (internal replication). Figures are mean (sd).")
     )

\begin{table}
\caption{Comparison of critical care model outputs: stage 1 versus stage 2 (internal replication). Figures are mean (sd).}
\label{Table:1}
\begin{tabular}{lllllllllll}
\toprule
Study Stage & \multicolumn{5}{r}{Stage 1} & \multicolumn{5}{r}{Stage 2} \\
metric & 0. Patient Count & 1. Cancelled Elective Operations & 2. Bed Utilization & 3. Bed Occupancy & 4. Mean Unplanned Admission Waiting Time (hours) & 0. Patient Count & 1. Cancelled Elective Operations & 2. Bed Utilization & 3. Bed Occupancy & 4. Mean Unplanned Admission Waiting Time (hours) \\
\midrule
23 beds & 1,650.4 (17.83) & 390.6 (30.57) & 0.9 (0.02) & 21.3 (0.49) & 103.8 (72.08) & 1,650.4 (17.83) & 390.6 (30.57) & 0.9 (0.02) & 21.3 (0.49) & 103.8 (72.08) \\
24 beds & 1,650.4 (17.83) & 337.8 (38.75) & 0.9 (0.02) & 21.8 (0.50) & 62.5 (55.23) & 1,650.4 (17.83) & 337.8 (38.75) & 0.9 (0.02) & 21.8 (0.50) & 62.5 (55.23) \\
25 beds & 1,650.4 (17.83) & 279.0 (39.13) & 0.9 (0.02) & 22.3 (0.54) & 35.0 (29.28) & 1,650.4 (17.83) & 279.0 (39.13) & 0.9 (0.02) & 22.3 (0.54) & 35.0 (29.28) \\
26 beds & 1,650.4 (17.83) & 231.4 (33.83) & 0.9 (0.02) & 22.6 (0.56) & 20.8 (15.59) & 1,650.4 (17.83) & 231.4 (33.83) & 0.9 (0.02) & 22.6 (0.56) & 20.8 (15.59) \\
27 beds & 1,650.4 (17.83) & 178.4 (32.46) & 0.8 (0.02) & 23.0 (0.58) & 12.0 (7.66) & 1,650.4 (17.83) & 178.4 (32.46) & 0.8 (0.02) & 23.0 (0.58) & 12.0 (7.66) \\
28 beds & 1,650.4 (17.83) & 139.8 (27.58) & 0.8 (0.02) & 23.3 (0.62) & 7.0 (3.76) & 1,650.4 (17.83) & 139.8 (27.58) & 0.8 (0.02) & 23.3 (0.62) & 7.0 (3.76) \\
\bottomrule
\end{tabular}
\end{table}

5. Output to Markdown#

print(combined_results.reset_index().to_markdown(index=False))

| Study Stage   | metric                                           | 23 beds         | 24 beds         | 25 beds         | 26 beds         | 27 beds         | 28 beds         |
|:--------------|:-------------------------------------------------|:----------------|:----------------|:----------------|:----------------|:----------------|:----------------|
| Stage 1       | 0. Patient Count                                 | 1,650.4 (17.83) | 1,650.4 (17.83) | 1,650.4 (17.83) | 1,650.4 (17.83) | 1,650.4 (17.83) | 1,650.4 (17.83) |
| Stage 1       | 1. Cancelled Elective Operations                 | 390.6 (30.57)   | 337.8 (38.75)   | 279.0 (39.13)   | 231.4 (33.83)   | 178.4 (32.46)   | 139.8 (27.58)   |
| Stage 1       | 2. Bed Utilization                               | 0.9 (0.02)      | 0.9 (0.02)      | 0.9 (0.02)      | 0.9 (0.02)      | 0.8 (0.02)      | 0.8 (0.02)      |
| Stage 1       | 3. Bed Occupancy                                 | 21.3 (0.49)     | 21.8 (0.50)     | 22.3 (0.54)     | 22.6 (0.56)     | 23.0 (0.58)     | 23.3 (0.62)     |
| Stage 1       | 4. Mean Unplanned Admission Waiting Time (hours) | 103.8 (72.08)   | 62.5 (55.23)    | 35.0 (29.28)    | 20.8 (15.59)    | 12.0 (7.66)     | 7.0 (3.76)      |
| Stage 2       | 0. Patient Count                                 | 1,650.4 (17.83) | 1,650.4 (17.83) | 1,650.4 (17.83) | 1,650.4 (17.83) | 1,650.4 (17.83) | 1,650.4 (17.83) |
| Stage 2       | 1. Cancelled Elective Operations                 | 390.6 (30.57)   | 337.8 (38.75)   | 279.0 (39.13)   | 231.4 (33.83)   | 178.4 (32.46)   | 139.8 (27.58)   |
| Stage 2       | 2. Bed Utilization                               | 0.9 (0.02)      | 0.9 (0.02)      | 0.9 (0.02)      | 0.9 (0.02)      | 0.8 (0.02)      | 0.8 (0.02)      |
| Stage 2       | 3. Bed Occupancy                                 | 21.3 (0.49)     | 21.8 (0.50)     | 22.3 (0.54)     | 22.6 (0.56)     | 23.0 (0.58)     | 23.3 (0.62)     |
| Stage 2       | 4. Mean Unplanned Admission Waiting Time (hours) | 103.8 (72.08)   | 62.5 (55.23)    | 35.0 (29.28)    | 20.8 (15.59)    | 12.0 (7.66)     | 7.0 (3.76)      |

6. Statistical tests#

Analysis functions#

def extract_means_from_combined_results(combined_results, metric_name):
    """
    Extract mean values for a specific metric from both stages.
    Returns arrays of means for Stage 1 and Stage 2 across bed capacities.

    Parameters
    ----------
    combined_results: pd.DataFrame
        Dataframe with results from stages 1 and 2, with columns for number of
        beds, and rows for metrics for each stage.
    metric_name: str
        Name of metric in combined_results (e.g., "1. Cancelled Elective
        Operations").

    Returns
    -------
    tuple[np.ndarray, np.ndarray]
        Two numpy arrays containing:
        - stage_1_means: Mean values from Stage 1 across bed capacities
        - stage_2_means: Mean values from Stage 2 across bed capacities
        Arrays have identical length corresponding to bed capacities 23-28.
    """
    # Filter for the specific metric
    metric_data = combined_results.loc[(slice(None), metric_name), :]
    
    stage_1_means = []
    stage_2_means = []
    
    bed_capacities = [f'{i} beds' for i in range(23, 29)]  # 23-28 beds
    
    for bed_cap in bed_capacities:
        if bed_cap in metric_data.columns:
            # Extract numeric values from "mean (std)" format
            stage_1_val = metric_data.loc[('Stage 1', metric_name), bed_cap]
            stage_2_val = metric_data.loc[('Stage 2', metric_name), bed_cap]
            
            # Parse the mean from "mean (std)" format
            stage_1_mean = float(stage_1_val.split(' (')[0].replace(',', ''))
            stage_2_mean = float(stage_2_val.split(' (')[0].replace(',', ''))
            
            stage_1_means.append(stage_1_mean)
            stage_2_means.append(stage_2_mean)
    
    return np.array(stage_1_means), np.array(stage_2_means)

def perform_statistical_comparison(combined_results, metrics_to_test=None):
    """
    Perform paired t-tests and calculate confidence intervals for model
    comparison.

    Parameters
    ----------
    combined_results: pd.DataFrame
        Dataframe with results from stages 1 and 2, with columns for number of
        beds, and rows for metrics for each stage.
     metrics_to_test : list[str], optional
        Specific metrics to analyse. If None, tests all metrics found in
        the combined_results dataframe. Default is None.

    Returns
    -------
    pd.DataFrame
        Statistical analysis results containing columns:
        - Metric: Name of the performance metric tested
        - Mean_Difference: Average difference between Stage 1 and Stage 2
        - T_Statistic: Paired t-test statistic
        - P_Value: Statistical significance (p-value)
        - CI_Lower/CI_Upper: 95% confidence interval bounds for mean difference
        - Statistically_Significant: Boolean indicating significance (p < 0.05)
        - Interpretation: Text interpretation of statistical result
    """
    if metrics_to_test is None:
        # Get all metrics from the combined results
        metrics_to_test = (
            combined_results.index.get_level_values('metric').unique())
    
    statistical_results = []
    
    for metric in metrics_to_test:
        try:
            # Extract means for this metric
            stage_1_means, stage_2_means = extract_means_from_combined_results(
                combined_results, metric
            )
            
            # Perform paired t-test
            t_stat, p_value = stats.ttest_rel(stage_1_means, stage_2_means)
            
            # Calculate differences and confidence interval
            differences = stage_1_means - stage_2_means
            mean_diff = np.mean(differences)
            sem_diff = stats.sem(differences)
            
            # 95% confidence interval for the difference
            confidence_level = 0.95
            degrees_freedom = len(differences) - 1
            t_critical = stats.t.ppf(
                (1 + confidence_level) / 2, degrees_freedom)
            margin_of_error = t_critical * sem_diff
            
            ci_lower = mean_diff - margin_of_error
            ci_upper = mean_diff + margin_of_error
            
            # Determine significance
            is_significant = p_value < 0.05
            
            statistical_results.append({
                'Metric': metric,
                'Mean_Difference': mean_diff,
                'T_Statistic': t_stat,
                'P_Value': p_value,
                'CI_Lower': ci_lower,
                'CI_Upper': ci_upper,
                'Statistically_Significant': is_significant,
                'Interpretation': ('Significant difference' if is_significant
                                   else 'No significant difference')
            })
            
        except Exception as e:
            print(f"Error processing metric '{metric}': {e}")
            continue
    
    return pd.DataFrame(statistical_results)

def create_statistical_summary_table(statistical_results):
    """
    Create a formatted table showing statistical test results.

    Parameters
    ----------
    statistical_results : pd.DataFrame
        Raw statistical results from perform_statistical_comparison() containing
        numerical test statistics, p-values, and confidence intervals.
    
    Returns
    -------
    pd.DataFrame
        Formatted summary table with:
        - Properly formatted numerical values (appropriate decimal places)
        - Combined confidence interval column in "(lower, upper)" format
        - Clear column headers suitable for academic publication
        - Ordered columns for logical reading flow
    """
    summary = statistical_results.copy()
    
    # Format numerical columns
    summary['Mean_Difference'] = summary['Mean_Difference'].map('{:.3f}'.format)
    summary['T_Statistic'] = summary['T_Statistic'].map('{:.3f}'.format)
    summary['P_Value'] = summary['P_Value'].map('{:.4f}'.format)
    summary['CI_95%'] = summary.apply(
        lambda row: f"({row['CI_Lower']:.3f}, {row['CI_Upper']:.3f})", axis=1
    )
    
    # Select and reorder columns for display
    display_columns = ['Metric', 'Mean_Difference', 'T_Statistic', 'P_Value', 
                      'CI_95%', 'Statistically_Significant', 'Interpretation']
    
    return summary[display_columns]

Statistical comparison of stage 1 and stage 2 models#

# Run statistical tests
statistical_results = perform_statistical_comparison(combined_results)

# Create formatted summary
statistical_summary = create_statistical_summary_table(statistical_results)

# Display results
display(statistical_summary)

# Save statistical results
statistical_summary.to_csv(
    "../04_results/01_CCU/statistical_comparison.csv", index=False)

	Metric	T_Statistic	P_Value	CI_95%	Statistically_Significant	Interpretation
0	0. Patient Count	nan	nan	(0.000, 0.000)	False	No significant difference
1	1. Cancelled Elective Operations	nan	nan	(0.000, 0.000)	False	No significant difference
2	2. Bed Utilization	nan	nan	(0.000, 0.000)	False	No significant difference
3	3. Bed Occupancy	nan	nan	(0.000, 0.000)	False	No significant difference
4	4. Mean Unplanned Admission Waiting Time (hours)	nan	nan	(0.000, 0.000)	False	No significant difference

Generate LaTeX table for statistical results#

latex_table = statistical_summary.to_latex(
    index=False,
    escape=False,
    caption=("Statistical comparison of Stage 1 vs Stage 2 model outputs " +
             "using paired t-tests. CI = 95% confidence interval for mean " +
             "difference."),
    label="tab:statistical_comparison"
)
print(latex_table)

\begin{table}
\caption{Statistical comparison of Stage 1 vs Stage 2 model outputs using paired t-tests. CI = 95% confidence interval for mean difference.}
\label{tab:statistical_comparison}
\begin{tabular}{lllllrl}
\toprule
Metric & Mean_Difference & T_Statistic & P_Value & CI_95% & Statistically_Significant & Interpretation \\
\midrule
0. Patient Count & 0.000 & nan & nan & (0.000, 0.000) & False & No significant difference \\
1. Cancelled Elective Operations & 0.000 & nan & nan & (0.000, 0.000) & False & No significant difference \\
2. Bed Utilization & 0.000 & nan & nan & (0.000, 0.000) & False & No significant difference \\
3. Bed Occupancy & 0.000 & nan & nan & (0.000, 0.000) & False & No significant difference \\
4. Mean Unplanned Admission Waiting Time (hours) & 0.000 & nan & nan & (0.000, 0.000) & False & No significant difference \\
\bottomrule
\end{tabular}
\end{table}