Files
sheq-analysis-tool/analysis.py
T
2026-04-20 15:23:18 +12:00

695 lines
30 KiB
Python

"""
SHEQ Incident Analysis Engine
Generates charts and a DOCX report comparing two Project Director periods.
Usage:
from analysis import run_analysis
run_analysis("All_Events__5_.xlsx", "2024-01-01", "2025-04-01",
"Matthew Arthur", "Manga", output_dir="output")
"""
import os
import pandas as pd
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import numpy as np
from docx import Document
from docx.shared import Inches, Pt, Cm, RGBColor, Emu
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT
from docx.oxml.ns import qn, nsdecls
from docx.oxml import parse_xml
from io import BytesIO
# ── Brand Colours (see DESIGN.md) ──
# Primary
DEEP_BLUE = RGBColor(0x0B, 0x32, 0x54)
SKY_BLUE = RGBColor(0x13, 0xB5, 0xEA)
# Secondary
DARK_GREEN = RGBColor(0x00, 0x6E, 0x47)
MID_GREEN = RGBColor(0x00, 0x99, 0x46)
LIGHT_GREEN = RGBColor(0x7B, 0xC1, 0x43)
PURPLE = RGBColor(0x96, 0x35, 0x8D)
# Functional
GREY = RGBColor(0x64, 0x74, 0x8B)
# Aliases used throughout
NAVY = DEEP_BLUE
TEAL = SKY_BLUE
GREEN = DARK_GREEN
# Hex versions for matplotlib
DEEP_BLUE_HEX = "#0b3254"
SKY_BLUE_HEX = "#13b5ea"
DARK_GREEN_HEX = "#006e47"
MID_GREEN_HEX = "#009946"
LIGHT_GREEN_HEX = "#7bc143"
PURPLE_HEX = "#96358d"
AMBER_HEX = "#d97706"
RED_HEX = "#dc2626"
# Chart palette sequence per DESIGN.md
CHART_PALETTE = [DEEP_BLUE_HEX, SKY_BLUE_HEX, DARK_GREEN_HEX, MID_GREEN_HEX,
LIGHT_GREEN_HEX, PURPLE_HEX, AMBER_HEX, RED_HEX]
# PD comparison colours
MA_HEX = DEEP_BLUE_HEX # PD1 = Deep Blue
MG_HEX = SKY_BLUE_HEX # PD2 = Sky Blue
# ═══════════════════════════════════════════════
# DATA LOADING & PREPARATION
# ═══════════════════════════════════════════════
def load_and_prepare(filepath, start_date, split_date):
"""Load Excel, filter by date range, add PD column."""
df = pd.read_excel(filepath)
df["Event Date"] = pd.to_datetime(df["Event Date"])
df = df[df["Event Date"] >= pd.Timestamp(start_date)].copy()
df["Year"] = df["Event Date"].dt.year
df["Month"] = df["Event Date"].dt.month
df["MonthName"] = df["Event Date"].dt.strftime("%b")
df["DOW"] = df["Event Date"].dt.day_name()
df["YearMonth"] = df["Event Date"].dt.to_period("M")
df["PD"] = df["Event Date"].apply(
lambda x: "pd1" if x < pd.Timestamp(split_date) else "pd2"
)
return df
def get_body_parts(series):
"""Split multi-value body part entries and normalise."""
parts = []
for val in series.dropna():
for part in str(val).split(","):
part = part.strip()
if part and "unspecified" not in part.lower():
parts.append(part)
return pd.Series(parts)
# ═══════════════════════════════════════════════
# CHART GENERATION
# ═══════════════════════════════════════════════
def _save(fig, path):
fig.tight_layout()
fig.savefig(path, dpi=200, bbox_inches="tight", facecolor="white")
plt.close(fig)
def _setup_chart_style():
"""Configure matplotlib to use Source Sans Pro if available."""
import matplotlib.font_manager as fm
available = [f.name for f in fm.fontManager.ttflist]
if "Source Sans Pro" in available:
plt.rcParams["font.family"] = "Source Sans Pro"
elif "Source Sans 3" in available:
plt.rcParams["font.family"] = "Source Sans 3"
else:
plt.rcParams["font.family"] = "sans-serif"
def generate_charts(df, pd1_name, pd2_name, split_date, output_dir):
"""Generate all comparison charts, return dict of paths."""
_setup_chart_style()
charts = {}
pd1 = df[df["PD"] == "pd1"]
pd2 = df[df["PD"] == "pd2"]
# Consequence severity colours per DESIGN.md
CONS_COLORS = [DARK_GREEN_HEX, AMBER_HEX, RED_HEX, PURPLE_HEX]
# 1. Monthly trend by PD
fig, ax = plt.subplots(figsize=(10, 4))
start_period = df["Event Date"].min().to_period("M")
end_period = df["Event Date"].max().to_period("M")
months_all = pd.period_range(start_period, end_period, freq="M")
monthly = df.groupby(["YearMonth", "PD"]).size().unstack(fill_value=0).reindex(months_all, fill_value=0)
x = range(len(months_all))
labels = [m.strftime("%b %y") for m in months_all]
ma_vals = monthly.get("pd1", pd.Series(0, index=months_all)).values
mg_vals = monthly.get("pd2", pd.Series(0, index=months_all)).values
ax.bar(x, ma_vals, color=MA_HEX, label=pd1_name, width=0.7, alpha=0.9)
ax.bar(x, mg_vals, bottom=ma_vals, color=MG_HEX, label=pd2_name, width=0.7, alpha=0.9)
split_m = pd.Timestamp(split_date).to_period("M")
if split_m in months_all:
trans_idx = list(months_all).index(split_m)
ax.axvline(x=trans_idx - 0.5, color=RED_HEX, linestyle="--", linewidth=1.5, alpha=0.7)
ax.text(trans_idx - 0.3, max(max(ma_vals + mg_vals), 1) * 0.95, "PD Transition",
fontsize=9, color=RED_HEX, ha="left")
ax.set_xticks(x)
ax.set_xticklabels(labels, rotation=45, ha="right", fontsize=8)
ax.set_title("Monthly Events by Project Director", fontsize=14, fontweight="bold", color=MA_HEX)
ax.set_ylabel("Events")
ax.legend(loc="upper right")
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
p = os.path.join(output_dir, "monthly_by_pd.png")
_save(fig, p)
charts["monthly_by_pd"] = p
# 2. Event type comparison
evt_types = df["Event Type"].value_counts().index[:8]
ma_evt = pd1["Event Type"].value_counts().reindex(evt_types, fill_value=0)
mg_evt = pd2["Event Type"].value_counts().reindex(evt_types, fill_value=0)
fig, ax = plt.subplots(figsize=(9, 5))
y = np.arange(len(evt_types))
h = 0.35
ax.barh(y - h / 2, ma_evt.values, h, label=pd1_name, color=MA_HEX)
ax.barh(y + h / 2, mg_evt.values, h, label=pd2_name, color=MG_HEX)
ax.set_yticks(y)
ax.set_yticklabels(evt_types, fontsize=10)
ax.invert_yaxis()
ax.set_title("Event Types by Project Director", fontsize=14, fontweight="bold", color=MA_HEX)
ax.legend()
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
for i, (v1, v2) in enumerate(zip(ma_evt.values, mg_evt.values)):
ax.text(v1 + 0.2, i - h / 2, str(v1), va="center", fontsize=9, color=MA_HEX)
ax.text(v2 + 0.2, i + h / 2, str(v2), va="center", fontsize=9, color=MG_HEX)
p = os.path.join(output_dir, "event_type_by_pd.png")
_save(fig, p)
charts["event_type_by_pd"] = p
# 3. Consequence comparison (pie charts)
cons_order = ["Negligible", "Minor", "Moderate", "Major"]
fig, axes = plt.subplots(1, 2, figsize=(9, 3.5))
for ax, sub, title in zip(axes, [pd1, pd2], [pd1_name, pd2_name]):
data = sub["Actual Consequence"].value_counts().reindex(cons_order, fill_value=0)
ax.pie(data.values, labels=cons_order, autopct="%1.0f%%", colors=CONS_COLORS, startangle=140,
textprops={"fontsize": 9})
ax.set_title(title, fontsize=13, fontweight="bold", color=MA_HEX)
p = os.path.join(output_dir, "consequence_by_pd.png")
_save(fig, p)
charts["consequence_by_pd"] = p
# 4. Day of week
dow_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
fig, ax = plt.subplots(figsize=(9, 4))
x_arr = np.arange(len(dow_order))
w = 0.35
ma_d = pd1["DOW"].value_counts().reindex(dow_order, fill_value=0)
mg_d = pd2["DOW"].value_counts().reindex(dow_order, fill_value=0)
b1 = ax.bar(x_arr - w / 2, ma_d.values, w, label=pd1_name, color=MA_HEX)
b2 = ax.bar(x_arr + w / 2, mg_d.values, w, label=pd2_name, color=MG_HEX)
ax.set_xticks(x_arr)
ax.set_xticklabels([d[:3] for d in dow_order])
ax.set_title("Events by Day of Week", fontsize=14, fontweight="bold", color=MA_HEX)
ax.legend()
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
for b in b1:
if b.get_height() > 0:
ax.text(b.get_x() + b.get_width() / 2, b.get_height() + 0.3, str(int(b.get_height())),
ha="center", fontsize=9)
for b in b2:
if b.get_height() > 0:
ax.text(b.get_x() + b.get_width() / 2, b.get_height() + 0.3, str(int(b.get_height())),
ha="center", fontsize=9)
p = os.path.join(output_dir, "dow_by_pd.png")
_save(fig, p)
charts["dow_by_pd"] = p
# 5. Root cause
rc_cats = ["External Factors", "People", "Production / Delivery", "Process", "Planning", "Providers"]
fig, ax = plt.subplots(figsize=(9, 4))
y = np.arange(len(rc_cats))
h = 0.35
ma_rc = pd1["Root Cause Category"].value_counts().reindex(rc_cats, fill_value=0)
mg_rc = pd2["Root Cause Category"].value_counts().reindex(rc_cats, fill_value=0)
ax.barh(y - h / 2, ma_rc.values, h, label=pd1_name, color=MA_HEX)
ax.barh(y + h / 2, mg_rc.values, h, label=pd2_name, color=MG_HEX)
ax.set_yticks(y)
ax.set_yticklabels(rc_cats, fontsize=10)
ax.invert_yaxis()
ax.set_title("Root Cause Categories by Project Director", fontsize=14, fontweight="bold", color=MA_HEX)
ax.legend()
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
p = os.path.join(output_dir, "rootcause_by_pd.png")
_save(fig, p)
charts["rootcause_by_pd"] = p
# 6. CRP comparison
crp_all = df["CRPInvolved"].value_counts()
crp_active = crp_all[~crp_all.index.isin(["None Identified", "Under Investigation"])].head(8)
crp_cats = crp_active.index
fig, ax = plt.subplots(figsize=(9, 4.5))
y = np.arange(len(crp_cats))
ma_c = pd1["CRPInvolved"].value_counts().reindex(crp_cats, fill_value=0)
mg_c = pd2["CRPInvolved"].value_counts().reindex(crp_cats, fill_value=0)
ax.barh(y - h / 2, ma_c.values, h, label=pd1_name, color=MA_HEX)
ax.barh(y + h / 2, mg_c.values, h, label=pd2_name, color=MG_HEX)
ax.set_yticks(y)
ax.set_yticklabels(crp_cats, fontsize=9)
ax.invert_yaxis()
ax.set_title("Critical Risk Protocols by Project Director", fontsize=14, fontweight="bold", color=MA_HEX)
ax.legend()
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
p = os.path.join(output_dir, "crp_by_pd.png")
_save(fig, p)
charts["crp_by_pd"] = p
# 7. Body parts
bp_series = get_body_parts(df["Bodily Location"])
if len(bp_series) > 0:
bp_top = bp_series.value_counts().head(10)
fig, ax = plt.subplots(figsize=(8, 4))
ax.barh(range(len(bp_top)), bp_top.values, color=DARK_GREEN_HEX)
ax.set_yticks(range(len(bp_top)))
ax.set_yticklabels(bp_top.index, fontsize=10)
ax.invert_yaxis()
for i, v in enumerate(bp_top.values):
ax.text(v + 0.1, i, str(v), va="center", fontsize=11, fontweight="bold")
ax.set_title("Top Injured Body Parts", fontsize=14, fontweight="bold", color=MA_HEX)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
p = os.path.join(output_dir, "body_parts.png")
_save(fig, p)
charts["body_parts"] = p
return charts
# ═══════════════════════════════════════════════
# DOCX GENERATION
# ═══════════════════════════════════════════════
def _set_cell_shading(cell, color_hex):
"""Apply background shading to a table cell."""
shading = parse_xml(f'<w:shd {nsdecls("w")} w:fill="{color_hex}" w:val="clear"/>')
cell._tc.get_or_add_tcPr().append(shading)
def _add_styled_table(doc, headers, rows, col_widths_inches):
"""Add a formatted comparison table."""
table = doc.add_table(rows=1 + len(rows), cols=len(headers))
table.alignment = WD_TABLE_ALIGNMENT.LEFT
table.style = "Table Grid"
# Header row
for i, h in enumerate(headers):
cell = table.rows[0].cells[i]
cell.text = ""
p = cell.paragraphs[0]
run = p.add_run(h)
run.bold = True
run.font.size = Pt(9)
run.font.color.rgb = RGBColor(0xFF, 0xFF, 0xFF)
run.font.name = "Source Sans Pro"
_set_cell_shading(cell, "0b3254")
# Data rows
for ri, row in enumerate(rows):
for ci, val in enumerate(row):
cell = table.rows[ri + 1].cells[ci]
cell.text = ""
p = cell.paragraphs[0]
run = p.add_run(str(val))
run.font.size = Pt(9)
run.font.name = "Source Sans Pro"
bg = "F0F5FA" if ri % 2 == 0 else "FFFFFF"
_set_cell_shading(cell, bg)
# Set column widths
for i, w in enumerate(col_widths_inches):
for row in table.rows:
row.cells[i].width = Inches(w)
return table
def generate_docx(df, pd1_name, pd2_name, split_date, charts, output_dir):
"""Generate the full DOCX report."""
doc = Document()
# Set default font
style = doc.styles["Normal"]
style.font.name = "Source Sans Pro"
style.font.size = Pt(11)
# Heading styles
for level, size, color in [(1, 16, NAVY), (2, 13, TEAL)]:
hs = doc.styles[f"Heading {level}"]
hs.font.name = "Source Sans Pro"
hs.font.size = Pt(size)
hs.font.color.rgb = color
hs.font.bold = True
pd1 = df[df["PD"] == "pd1"]
pd2 = df[df["PD"] == "pd2"]
total = len(df)
pd1_months = max(1, (pd.Timestamp(split_date) - df["Event Date"].min()).days / 30.44)
pd2_months = max(1, (df["Event Date"].max() - pd.Timestamp(split_date)).days / 30.44 + 1)
pd1_start = pd1["Event Date"].min().strftime("%b %Y") if len(pd1) > 0 else "N/A"
pd1_end = pd1["Event Date"].max().strftime("%b %Y") if len(pd1) > 0 else "N/A"
pd2_start = pd2["Event Date"].min().strftime("%b %Y") if len(pd2) > 0 else "N/A"
pd2_end = pd2["Event Date"].max().strftime("%b %Y") if len(pd2) > 0 else "N/A"
# ── Title page ──
doc.add_paragraph("")
doc.add_paragraph("")
p = doc.add_paragraph()
p.alignment = WD_ALIGN_PARAGRAPH.CENTER
run = p.add_run("SHEQ Incident Analysis")
run.font.size = Pt(28)
run.bold = True
run.font.name = "Source Sans Pro"
run.font.color.rgb = NAVY
p = doc.add_paragraph()
p.alignment = WD_ALIGN_PARAGRAPH.CENTER
run = p.add_run("Far North Waters Project")
run.font.size = Pt(16)
run.font.name = "Source Sans Pro"
run.font.color.rgb = TEAL
p = doc.add_paragraph()
p.alignment = WD_ALIGN_PARAGRAPH.CENTER
run = p.add_run(f"{pd1_start} \u2013 {pd2_end} (MTD)")
run.font.size = Pt(14)
run.font.name = "Source Sans Pro"
run.font.color.rgb = TEAL
doc.add_paragraph("")
p = doc.add_paragraph()
p.alignment = WD_ALIGN_PARAGRAPH.CENTER
run = p.add_run("Performance by Project Director")
run.font.size = Pt(13)
run.bold = True
run.font.name = "Source Sans Pro"
run.font.color.rgb = NAVY
p = doc.add_paragraph()
p.alignment = WD_ALIGN_PARAGRAPH.CENTER
run = p.add_run(f"{pd1_name} ")
run.bold = True
run.font.color.rgb = NAVY
run = p.add_run(f"({pd1_start} \u2013 {pd1_end}) | ")
run.font.color.rgb = GREY
run = p.add_run(f"{pd2_name} ")
run.bold = True
run.font.color.rgb = TEAL
run = p.add_run(f"({pd2_start} \u2013 {pd2_end})")
run.font.color.rgb = GREY
doc.add_paragraph("")
p = doc.add_paragraph()
p.alignment = WD_ALIGN_PARAGRAPH.CENTER
run = p.add_run("Ventia \u2022 Infrastructure Services \u2022 Water & Environmental Services")
run.font.size = Pt(10)
run.font.color.rgb = GREY
doc.add_page_break()
# ── Helper functions ──
def h1(text):
doc.add_heading(text, level=1)
def h2(text):
doc.add_heading(text, level=2)
def text(t, bold=False):
p = doc.add_paragraph()
run = p.add_run(t)
run.bold = bold
return p
def bullet(t):
p = doc.add_paragraph(t, style="List Bullet")
return p
def add_chart(name, width=5.5):
if name in charts:
doc.add_picture(charts[name], width=Inches(width))
# Helper for injury classification
def _inj_class(sub):
return sub["Ventia Injury Classification"].value_counts()
# ═══════════════════════════════════════════
# 1. EXECUTIVE SUMMARY
# ═══════════════════════════════════════════
h1("1. Executive Summary")
text(f"This report analyses {total} SHEQ events recorded for the Far North Waters project "
f"from {pd1_start} to {pd2_end} (month-to-date). The analysis is structured around "
f"two Project Director tenures to enable performance comparison:")
pd1_inj = pd1[pd1["Event Type"] == "Injury/Illness Sustained"]
pd2_inj = pd2[pd2["Event Type"] == "Injury/Illness Sustained"]
pd1_mv = pd1[pd1["Event Type"] == "Motor Vehicle"]
pd2_mv = pd2[pd2["Event Type"] == "Motor Vehicle"]
pd1_ic = _inj_class(pd1)
pd2_ic = _inj_class(pd2)
pd1_cc = len(pd1[pd1["Event Type"] == "Close Call"])
pd2_cc = len(pd2[pd2["Event Type"] == "Close Call"])
pd1_mod = len(pd1[pd1["Actual Consequence"].isin(["Moderate", "Major", "Substantial"])])
pd2_mod = len(pd2[pd2["Actual Consequence"].isin(["Moderate", "Major", "Substantial"])])
_add_styled_table(doc,
["", pd1_name, pd2_name],
[
["Period", f"{pd1_start} \u2013 {pd1_end}", f"{pd2_start} \u2013 {pd2_end}"],
["Duration", f"{pd1_months:.0f} months", f"{pd2_months:.0f} months"],
["Total Events", str(len(pd1)), str(len(pd2))],
["Events per Month", f"{len(pd1)/pd1_months:.1f}", f"{len(pd2)/pd2_months:.1f}"],
["Injuries", f"{len(pd1_inj)} ({len(pd1_inj)/max(len(pd1),1)*100:.1f}%)",
f"{len(pd2_inj)} ({len(pd2_inj)/max(len(pd2),1)*100:.1f}%)"],
["Motor Vehicle Events", f"{len(pd1_mv)} ({len(pd1_mv)/max(len(pd1),1)*100:.1f}%)",
f"{len(pd2_mv)} ({len(pd2_mv)/max(len(pd2),1)*100:.1f}%)"],
["Lost Time Injuries", str(pd1_ic.get("Lost Time Injury", 0)), str(pd2_ic.get("Lost Time Injury", 0))],
["First Aid Treatments", str(pd1_ic.get("First Aid Treatment", 0)), str(pd2_ic.get("First Aid Treatment", 0))],
["Close Calls", f"{pd1_cc} ({pd1_cc/max(len(pd1),1)*100:.1f}%)",
f"{pd2_cc} ({pd2_cc/max(len(pd2),1)*100:.1f}%)"],
["Moderate+ Consequence", f"{pd1_mod} ({pd1_mod/max(len(pd1),1)*100:.1f}%)",
f"{pd2_mod} ({pd2_mod/max(len(pd2),1)*100:.1f}%)"],
["Median Days to Investigate", f"{pd1['Days to Investigate'].dropna().median():.0f}",
f"{pd2['Days to Investigate'].dropna().median():.0f}"],
["Median Days to Close", f"{pd1['Days to Close'].dropna().median():.0f}",
f"{pd2['Days to Close'].dropna().median():.0f}"],
],
[2.0, 2.2, 2.3]
)
doc.add_paragraph("")
h2("Key Comparative Findings")
rate1 = len(pd1) / pd1_months
rate2 = len(pd2) / pd2_months
bullet(f"Event rate {'increased' if rate2 > rate1 else 'decreased'} under {pd2_name} "
f"({rate2:.1f}/month vs {rate1:.1f}/month), with Moderate+ consequences at "
f"{pd2_mod/max(len(pd2),1)*100:.1f}% vs {pd1_mod/max(len(pd1),1)*100:.1f}%.")
bullet(f"Motor vehicle events: {len(pd2_mv)} under {pd2_name} vs {len(pd1_mv)} under {pd1_name} "
f"({len(pd2_mv)/max(len(pd2),1)*100:.1f}% vs {len(pd1_mv)/max(len(pd1),1)*100:.1f}%).")
bullet(f"Close call reporting: {pd2_cc/max(len(pd2),1)*100:.1f}% under {pd2_name} vs "
f"{pd1_cc/max(len(pd1),1)*100:.1f}% under {pd1_name}.")
lti1 = pd1_ic.get("Lost Time Injury", 0)
lti2 = pd2_ic.get("Lost Time Injury", 0)
if lti2 > lti1:
bullet(f"{lti2} Lost Time Injuries under {pd2_name} compared to {lti1} under {pd1_name}.")
doc.add_page_break()
# ═══════════════════════════════════════════
# 2. MONTHLY TRENDS
# ═══════════════════════════════════════════
h1("2. Monthly Event Trends")
text("The chart below shows monthly event counts across both Project Director periods.")
add_chart("monthly_by_pd", 5.8)
doc.add_page_break()
# ═══════════════════════════════════════════
# 3. EVENT TYPE COMPARISON
# ═══════════════════════════════════════════
h1("3. Event Type Comparison")
add_chart("event_type_by_pd", 5.5)
evt_types = df["Event Type"].value_counts().index
evt_rows = []
for e in evt_types:
c1 = len(pd1[pd1["Event Type"] == e])
c2 = len(pd2[pd2["Event Type"] == e])
evt_rows.append([e, str(c1), f"{c1/max(len(pd1),1)*100:.1f}%",
str(c2), f"{c2/max(len(pd2),1)*100:.1f}%"])
_add_styled_table(doc, ["Event Type", pd1_name, "%", pd2_name, "%"], evt_rows,
[2.0, 1.1, 0.8, 1.0, 0.8])
doc.add_paragraph("")
text("Notable shifts:", bold=True)
# Auto-detect biggest shifts
for e in evt_types:
c1 = len(pd1[pd1["Event Type"] == e])
c2 = len(pd2[pd2["Event Type"] == e])
pct1 = c1 / max(len(pd1), 1) * 100
pct2 = c2 / max(len(pd2), 1) * 100
if abs(pct2 - pct1) > 5:
direction = "increased" if pct2 > pct1 else "decreased"
bullet(f"{e} {direction}: {pct1:.1f}% \u2192 {pct2:.1f}% ({c1} \u2192 {c2} events).")
doc.add_page_break()
# ═══════════════════════════════════════════
# 4. INJURY ANALYSIS
# ═══════════════════════════════════════════
h1("4. Injury Analysis")
h2("4.1 Injury Classification")
inj_classes = ["First Aid Treatment", "Report Only", "Non-Work Related",
"Lost Time Injury", "Medical Treatment Injury"]
inj_rows = [[c, str(pd1_ic.get(c, 0)), str(pd2_ic.get(c, 0))] for c in inj_classes]
_add_styled_table(doc, ["Classification", pd1_name, pd2_name], inj_rows, [2.5, 1.8, 1.8])
h2("4.2 Body Parts Injured")
add_chart("body_parts", 5.0)
# Body part comparison
bp1 = get_body_parts(pd1["Bodily Location"]).value_counts().head(6)
bp2 = get_body_parts(pd2["Bodily Location"]).value_counts().head(6)
all_bp = list(dict.fromkeys(list(bp1.index) + list(bp2.index)))[:8]
bp_rows = [[bp, str(bp1.get(bp, 0)), str(bp2.get(bp, 0))] for bp in all_bp]
_add_styled_table(doc, ["Body Part", pd1_name, pd2_name], bp_rows, [2.5, 1.8, 1.8])
doc.add_page_break()
# ═══════════════════════════════════════════
# 5. CONSEQUENCE ANALYSIS
# ═══════════════════════════════════════════
h1("5. Consequence Analysis")
add_chart("consequence_by_pd", 5.5)
cons_order = ["Negligible", "Minor", "Moderate", "Major"]
cons_rows = []
for c in cons_order:
c1 = len(pd1[pd1["Actual Consequence"] == c])
c2 = len(pd2[pd2["Actual Consequence"] == c])
cons_rows.append([c, str(c1), f"{c1/max(len(pd1),1)*100:.1f}%",
str(c2), f"{c2/max(len(pd2),1)*100:.1f}%"])
_add_styled_table(doc, ["Consequence", pd1_name, "%", pd2_name, "%"], cons_rows,
[1.5, 1.0, 0.8, 1.0, 0.8])
doc.add_page_break()
# ═══════════════════════════════════════════
# 6. CRP & ROOT CAUSE
# ═══════════════════════════════════════════
h1("6. Critical Risk Protocols & Root Causes")
h2("6.1 CRP Comparison")
add_chart("crp_by_pd", 5.5)
h2("6.2 Root Cause Comparison")
add_chart("rootcause_by_pd", 5.5)
rc_cats = ["External Factors", "People", "Production / Delivery", "Process", "Planning", "Providers"]
rc_rows = []
for r in rc_cats:
c1 = len(pd1[pd1["Root Cause Category"] == r])
c2 = len(pd2[pd2["Root Cause Category"] == r])
t1 = pd1["Root Cause Category"].notna().sum()
t2 = pd2["Root Cause Category"].notna().sum()
rc_rows.append([r, str(c1), f"{c1/max(t1,1)*100:.1f}%",
str(c2), f"{c2/max(t2,1)*100:.1f}%"])
_add_styled_table(doc, ["Root Cause", pd1_name, "%", pd2_name, "%"], rc_rows,
[2.0, 1.1, 0.8, 1.0, 0.8])
doc.add_page_break()
# ═══════════════════════════════════════════
# 7. TIMING PATTERNS
# ═══════════════════════════════════════════
h1("7. Timing Patterns")
add_chart("dow_by_pd", 5.5)
doc.add_page_break()
# ═══════════════════════════════════════════
# 8. INVESTIGATION PERFORMANCE
# ═══════════════════════════════════════════
h1("8. Investigation Performance")
inv_rows = [
["Median Days to Investigate", f"{pd1['Days to Investigate'].dropna().median():.0f}",
f"{pd2['Days to Investigate'].dropna().median():.0f}"],
["Mean Days to Investigate", f"{pd1['Days to Investigate'].dropna().mean():.1f}",
f"{pd2['Days to Investigate'].dropna().mean():.1f}"],
["Median Days to Close", f"{pd1['Days to Close'].dropna().median():.0f}",
f"{pd2['Days to Close'].dropna().median():.0f}"],
["Mean Days to Close", f"{pd1['Days to Close'].dropna().mean():.1f}",
f"{pd2['Days to Close'].dropna().mean():.1f}"],
["Events Closed", f"{(pd1['Status']=='Closed').sum()} ({(pd1['Status']=='Closed').sum()/max(len(pd1),1)*100:.0f}%)",
f"{(pd2['Status']=='Closed').sum()} ({(pd2['Status']=='Closed').sum()/max(len(pd2),1)*100:.0f}%)"],
["Events Open", str((pd1["Status"] == "Open").sum()), str((pd2["Status"] == "Open").sum())],
]
_add_styled_table(doc, ["Metric", pd1_name, pd2_name], inv_rows, [2.5, 1.8, 1.8])
doc.add_page_break()
# ═══════════════════════════════════════════
# 9. RECOMMENDATIONS
# ═══════════════════════════════════════════
h1("9. Key Findings & Recommendations")
h2(f"9.1 Areas Requiring Attention ({pd2_name} Period)")
if len(pd2_mv) > len(pd1_mv):
bullet("Motor vehicle events have increased \u2014 reinforce journey management plans and reversing protocols.")
if pd2_mod / max(len(pd2), 1) > pd1_mod / max(len(pd1), 1):
bullet("Moderate+ consequence events have increased \u2014 investigate whether controls are being bypassed.")
if pd2_cc / max(len(pd2), 1) < pd1_cc / max(len(pd1), 1):
bullet("Close call reporting has declined \u2014 implement reporting targets and recognise reporters.")
if lti2 > lti1:
bullet(f"{lti2} LTIs under {pd2_name} vs {lti1} under {pd1_name} \u2014 review circumstances and RTW processes.")
h2("9.2 Systemic Issues (Both Periods)")
bullet("Lower back injuries from manual handling at pump stations persist \u2014 engineering controls needed.")
bullet("Third Party/Public Liability events remain a large category, driven by aging infrastructure.")
bullet("Wednesday remains the peak risk day \u2014 consider targeted mid-week safety interventions.")
h2("9.3 Recommended Actions")
bullet("Set a close-call reporting KPI (minimum 10% of all events) and track monthly.")
bullet("Implement a motor vehicle safety campaign focusing on reversing and traffic management.")
bullet("Schedule quarterly PD safety performance reviews using this report format.")
# ── Save ──
output_path = os.path.join(output_dir, "SHEQ_PD_Comparison.docx")
doc.save(output_path)
return output_path
# ═══════════════════════════════════════════════
# MAIN ENTRY POINT
# ═══════════════════════════════════════════════
def run_analysis(filepath, start_date, split_date, pd1_name, pd2_name, output_dir="output"):
"""Run the full analysis pipeline."""
os.makedirs(output_dir, exist_ok=True)
print(f"Loading data from {filepath}...")
df = load_and_prepare(filepath, start_date, split_date)
print(f" {len(df)} events loaded ({df['Event Date'].min().date()} to {df['Event Date'].max().date()})")
print(f" {pd1_name}: {(df['PD']=='pd1').sum()} events")
print(f" {pd2_name}: {(df['PD']=='pd2').sum()} events")
print("Generating charts...")
charts = generate_charts(df, pd1_name, pd2_name, split_date, output_dir)
print(f" {len(charts)} charts created")
print("Generating DOCX report...")
docx_path = generate_docx(df, pd1_name, pd2_name, split_date, charts, output_dir)
print(f" Report saved to {docx_path}")
return docx_path
if __name__ == "__main__":
run_analysis(
filepath="All_Events__5_.xlsx",
start_date="2024-01-01",
split_date="2025-04-01",
pd1_name="Matthew Arthur",
pd2_name="Manga",
output_dir="output"
)