"""
|
||||
SHEQ Incident Analysis Engine
|
||||
Generates charts and a DOCX report comparing two Project Director periods.
|
||||
|
||||
Usage:
|
||||
from analysis import run_analysis
|
||||
run_analysis("All_Events__5_.xlsx", "2024-01-01", "2025-04-01",
|
||||
"Matthew Arthur", "Manga", output_dir="output")
|
||||
"""
|
||||
|
||||
import os
|
||||
import pandas as pd
|
||||
import matplotlib
|
||||
matplotlib.use("Agg")
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
from docx import Document
|
||||
from docx.shared import Inches, Pt, Cm, RGBColor, Emu
|
||||
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||||
from docx.enum.table import WD_TABLE_ALIGNMENT
|
||||
from docx.oxml.ns import qn, nsdecls
|
||||
from docx.oxml import parse_xml
|
||||
from io import BytesIO
|
||||
|
||||
|
||||
# ── Brand Colours (see DESIGN.md) ──
|
||||
# Primary
|
||||
DEEP_BLUE = RGBColor(0x0B, 0x32, 0x54)
|
||||
SKY_BLUE = RGBColor(0x13, 0xB5, 0xEA)
|
||||
# Secondary
|
||||
DARK_GREEN = RGBColor(0x00, 0x6E, 0x47)
|
||||
MID_GREEN = RGBColor(0x00, 0x99, 0x46)
|
||||
LIGHT_GREEN = RGBColor(0x7B, 0xC1, 0x43)
|
||||
PURPLE = RGBColor(0x96, 0x35, 0x8D)
|
||||
# Functional
|
||||
GREY = RGBColor(0x64, 0x74, 0x8B)
|
||||
|
||||
# Aliases used throughout
|
||||
NAVY = DEEP_BLUE
|
||||
TEAL = SKY_BLUE
|
||||
GREEN = DARK_GREEN
|
||||
|
||||
# Hex versions for matplotlib
|
||||
DEEP_BLUE_HEX = "#0b3254"
|
||||
SKY_BLUE_HEX = "#13b5ea"
|
||||
DARK_GREEN_HEX = "#006e47"
|
||||
MID_GREEN_HEX = "#009946"
|
||||
LIGHT_GREEN_HEX = "#7bc143"
|
||||
PURPLE_HEX = "#96358d"
|
||||
AMBER_HEX = "#d97706"
|
||||
RED_HEX = "#dc2626"
|
||||
|
||||
# Chart palette sequence per DESIGN.md
|
||||
CHART_PALETTE = [DEEP_BLUE_HEX, SKY_BLUE_HEX, DARK_GREEN_HEX, MID_GREEN_HEX,
|
||||
LIGHT_GREEN_HEX, PURPLE_HEX, AMBER_HEX, RED_HEX]
|
||||
|
||||
# PD comparison colours
|
||||
MA_HEX = DEEP_BLUE_HEX # PD1 = Deep Blue
|
||||
MG_HEX = SKY_BLUE_HEX # PD2 = Sky Blue
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════
|
||||
# DATA LOADING & PREPARATION
|
||||
# ═══════════════════════════════════════════════
|
||||
|
||||
def load_and_prepare(filepath, start_date, split_date):
|
||||
"""Load Excel, filter by date range, add PD column."""
|
||||
df = pd.read_excel(filepath)
|
||||
df["Event Date"] = pd.to_datetime(df["Event Date"])
|
||||
df = df[df["Event Date"] >= pd.Timestamp(start_date)].copy()
|
||||
df["Year"] = df["Event Date"].dt.year
|
||||
df["Month"] = df["Event Date"].dt.month
|
||||
df["MonthName"] = df["Event Date"].dt.strftime("%b")
|
||||
df["DOW"] = df["Event Date"].dt.day_name()
|
||||
df["YearMonth"] = df["Event Date"].dt.to_period("M")
|
||||
df["PD"] = df["Event Date"].apply(
|
||||
lambda x: "pd1" if x < pd.Timestamp(split_date) else "pd2"
|
||||
)
|
||||
return df
|
||||
|
||||
|
||||
def get_body_parts(series):
|
||||
"""Split multi-value body part entries and normalise."""
|
||||
parts = []
|
||||
for val in series.dropna():
|
||||
for part in str(val).split(","):
|
||||
part = part.strip()
|
||||
if part and "unspecified" not in part.lower():
|
||||
parts.append(part)
|
||||
return pd.Series(parts)
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════
|
||||
# CHART GENERATION
|
||||
# ═══════════════════════════════════════════════
|
||||
|
||||
def _save(fig, path):
|
||||
fig.tight_layout()
|
||||
fig.savefig(path, dpi=200, bbox_inches="tight", facecolor="white")
|
||||
plt.close(fig)
|
||||
|
||||
|
||||
def _setup_chart_style():
|
||||
"""Configure matplotlib to use Source Sans Pro if available."""
|
||||
import matplotlib.font_manager as fm
|
||||
available = [f.name for f in fm.fontManager.ttflist]
|
||||
if "Source Sans Pro" in available:
|
||||
plt.rcParams["font.family"] = "Source Sans Pro"
|
||||
elif "Source Sans 3" in available:
|
||||
plt.rcParams["font.family"] = "Source Sans 3"
|
||||
else:
|
||||
plt.rcParams["font.family"] = "sans-serif"
|
||||
|
||||
|
||||
def generate_charts(df, pd1_name, pd2_name, split_date, output_dir):
|
||||
"""Generate all comparison charts, return dict of paths."""
|
||||
_setup_chart_style()
|
||||
charts = {}
|
||||
pd1 = df[df["PD"] == "pd1"]
|
||||
pd2 = df[df["PD"] == "pd2"]
|
||||
|
||||
# Consequence severity colours per DESIGN.md
|
||||
CONS_COLORS = [DARK_GREEN_HEX, AMBER_HEX, RED_HEX, PURPLE_HEX]
|
||||
|
||||
# 1. Monthly trend by PD
|
||||
fig, ax = plt.subplots(figsize=(10, 4))
|
||||
start_period = df["Event Date"].min().to_period("M")
|
||||
end_period = df["Event Date"].max().to_period("M")
|
||||
months_all = pd.period_range(start_period, end_period, freq="M")
|
||||
monthly = df.groupby(["YearMonth", "PD"]).size().unstack(fill_value=0).reindex(months_all, fill_value=0)
|
||||
x = range(len(months_all))
|
||||
labels = [m.strftime("%b %y") for m in months_all]
|
||||
ma_vals = monthly.get("pd1", pd.Series(0, index=months_all)).values
|
||||
mg_vals = monthly.get("pd2", pd.Series(0, index=months_all)).values
|
||||
ax.bar(x, ma_vals, color=MA_HEX, label=pd1_name, width=0.7, alpha=0.9)
|
||||
ax.bar(x, mg_vals, bottom=ma_vals, color=MG_HEX, label=pd2_name, width=0.7, alpha=0.9)
|
||||
split_m = pd.Timestamp(split_date).to_period("M")
|
||||
if split_m in months_all:
|
||||
trans_idx = list(months_all).index(split_m)
|
||||
ax.axvline(x=trans_idx - 0.5, color=RED_HEX, linestyle="--", linewidth=1.5, alpha=0.7)
|
||||
ax.text(trans_idx - 0.3, max(max(ma_vals + mg_vals), 1) * 0.95, "PD Transition",
|
||||
fontsize=9, color=RED_HEX, ha="left")
|
||||
ax.set_xticks(x)
|
||||
ax.set_xticklabels(labels, rotation=45, ha="right", fontsize=8)
|
||||
ax.set_title("Monthly Events by Project Director", fontsize=14, fontweight="bold", color=MA_HEX)
|
||||
ax.set_ylabel("Events")
|
||||
ax.legend(loc="upper right")
|
||||
ax.spines["top"].set_visible(False)
|
||||
ax.spines["right"].set_visible(False)
|
||||
p = os.path.join(output_dir, "monthly_by_pd.png")
|
||||
_save(fig, p)
|
||||
charts["monthly_by_pd"] = p
|
||||
|
||||
# 2. Event type comparison
|
||||
evt_types = df["Event Type"].value_counts().index[:8]
|
||||
ma_evt = pd1["Event Type"].value_counts().reindex(evt_types, fill_value=0)
|
||||
mg_evt = pd2["Event Type"].value_counts().reindex(evt_types, fill_value=0)
|
||||
fig, ax = plt.subplots(figsize=(9, 5))
|
||||
y = np.arange(len(evt_types))
|
||||
h = 0.35
|
||||
ax.barh(y - h / 2, ma_evt.values, h, label=pd1_name, color=MA_HEX)
|
||||
ax.barh(y + h / 2, mg_evt.values, h, label=pd2_name, color=MG_HEX)
|
||||
ax.set_yticks(y)
|
||||
ax.set_yticklabels(evt_types, fontsize=10)
|
||||
ax.invert_yaxis()
|
||||
ax.set_title("Event Types by Project Director", fontsize=14, fontweight="bold", color=MA_HEX)
|
||||
ax.legend()
|
||||
ax.spines["top"].set_visible(False)
|
||||
ax.spines["right"].set_visible(False)
|
||||
for i, (v1, v2) in enumerate(zip(ma_evt.values, mg_evt.values)):
|
||||
ax.text(v1 + 0.2, i - h / 2, str(v1), va="center", fontsize=9, color=MA_HEX)
|
||||
ax.text(v2 + 0.2, i + h / 2, str(v2), va="center", fontsize=9, color=MG_HEX)
|
||||
p = os.path.join(output_dir, "event_type_by_pd.png")
|
||||
_save(fig, p)
|
||||
charts["event_type_by_pd"] = p
|
||||
|
||||
# 3. Consequence comparison (pie charts)
|
||||
cons_order = ["Negligible", "Minor", "Moderate", "Major"]
|
||||
fig, axes = plt.subplots(1, 2, figsize=(9, 3.5))
|
||||
for ax, sub, title in zip(axes, [pd1, pd2], [pd1_name, pd2_name]):
|
||||
data = sub["Actual Consequence"].value_counts().reindex(cons_order, fill_value=0)
|
||||
ax.pie(data.values, labels=cons_order, autopct="%1.0f%%", colors=CONS_COLORS, startangle=140,
|
||||
textprops={"fontsize": 9})
|
||||
ax.set_title(title, fontsize=13, fontweight="bold", color=MA_HEX)
|
||||
p = os.path.join(output_dir, "consequence_by_pd.png")
|
||||
_save(fig, p)
|
||||
charts["consequence_by_pd"] = p
|
||||
|
||||
# 4. Day of week
|
||||
dow_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
|
||||
fig, ax = plt.subplots(figsize=(9, 4))
|
||||
x_arr = np.arange(len(dow_order))
|
||||
w = 0.35
|
||||
ma_d = pd1["DOW"].value_counts().reindex(dow_order, fill_value=0)
|
||||
mg_d = pd2["DOW"].value_counts().reindex(dow_order, fill_value=0)
|
||||
b1 = ax.bar(x_arr - w / 2, ma_d.values, w, label=pd1_name, color=MA_HEX)
|
||||
b2 = ax.bar(x_arr + w / 2, mg_d.values, w, label=pd2_name, color=MG_HEX)
|
||||
ax.set_xticks(x_arr)
|
||||
ax.set_xticklabels([d[:3] for d in dow_order])
|
||||
ax.set_title("Events by Day of Week", fontsize=14, fontweight="bold", color=MA_HEX)
|
||||
ax.legend()
|
||||
ax.spines["top"].set_visible(False)
|
||||
ax.spines["right"].set_visible(False)
|
||||
for b in b1:
|
||||
if b.get_height() > 0:
|
||||
ax.text(b.get_x() + b.get_width() / 2, b.get_height() + 0.3, str(int(b.get_height())),
|
||||
ha="center", fontsize=9)
|
||||
for b in b2:
|
||||
if b.get_height() > 0:
|
||||
ax.text(b.get_x() + b.get_width() / 2, b.get_height() + 0.3, str(int(b.get_height())),
|
||||
ha="center", fontsize=9)
|
||||
p = os.path.join(output_dir, "dow_by_pd.png")
|
||||
_save(fig, p)
|
||||
charts["dow_by_pd"] = p
|
||||
|
||||
# 5. Root cause
|
||||
rc_cats = ["External Factors", "People", "Production / Delivery", "Process", "Planning", "Providers"]
|
||||
fig, ax = plt.subplots(figsize=(9, 4))
|
||||
y = np.arange(len(rc_cats))
|
||||
h = 0.35
|
||||
ma_rc = pd1["Root Cause Category"].value_counts().reindex(rc_cats, fill_value=0)
|
||||
mg_rc = pd2["Root Cause Category"].value_counts().reindex(rc_cats, fill_value=0)
|
||||
ax.barh(y - h / 2, ma_rc.values, h, label=pd1_name, color=MA_HEX)
|
||||
ax.barh(y + h / 2, mg_rc.values, h, label=pd2_name, color=MG_HEX)
|
||||
ax.set_yticks(y)
|
||||
ax.set_yticklabels(rc_cats, fontsize=10)
|
||||
ax.invert_yaxis()
|
||||
ax.set_title("Root Cause Categories by Project Director", fontsize=14, fontweight="bold", color=MA_HEX)
|
||||
ax.legend()
|
||||
ax.spines["top"].set_visible(False)
|
||||
ax.spines["right"].set_visible(False)
|
||||
p = os.path.join(output_dir, "rootcause_by_pd.png")
|
||||
_save(fig, p)
|
||||
charts["rootcause_by_pd"] = p
|
||||
|
||||
# 6. CRP comparison
|
||||
crp_all = df["CRPInvolved"].value_counts()
|
||||
crp_active = crp_all[~crp_all.index.isin(["None Identified", "Under Investigation"])].head(8)
|
||||
crp_cats = crp_active.index
|
||||
fig, ax = plt.subplots(figsize=(9, 4.5))
|
||||
y = np.arange(len(crp_cats))
|
||||
ma_c = pd1["CRPInvolved"].value_counts().reindex(crp_cats, fill_value=0)
|
||||
mg_c = pd2["CRPInvolved"].value_counts().reindex(crp_cats, fill_value=0)
|
||||
ax.barh(y - h / 2, ma_c.values, h, label=pd1_name, color=MA_HEX)
|
||||
ax.barh(y + h / 2, mg_c.values, h, label=pd2_name, color=MG_HEX)
|
||||
ax.set_yticks(y)
|
||||
ax.set_yticklabels(crp_cats, fontsize=9)
|
||||
ax.invert_yaxis()
|
||||
ax.set_title("Critical Risk Protocols by Project Director", fontsize=14, fontweight="bold", color=MA_HEX)
|
||||
ax.legend()
|
||||
ax.spines["top"].set_visible(False)
|
||||
ax.spines["right"].set_visible(False)
|
||||
p = os.path.join(output_dir, "crp_by_pd.png")
|
||||
_save(fig, p)
|
||||
charts["crp_by_pd"] = p
|
||||
|
||||
# 7. Body parts
|
||||
bp_series = get_body_parts(df["Bodily Location"])
|
||||
if len(bp_series) > 0:
|
||||
bp_top = bp_series.value_counts().head(10)
|
||||
fig, ax = plt.subplots(figsize=(8, 4))
|
||||
ax.barh(range(len(bp_top)), bp_top.values, color=DARK_GREEN_HEX)
|
||||
ax.set_yticks(range(len(bp_top)))
|
||||
ax.set_yticklabels(bp_top.index, fontsize=10)
|
||||
ax.invert_yaxis()
|
||||
for i, v in enumerate(bp_top.values):
|
||||
ax.text(v + 0.1, i, str(v), va="center", fontsize=11, fontweight="bold")
|
||||
ax.set_title("Top Injured Body Parts", fontsize=14, fontweight="bold", color=MA_HEX)
|
||||
ax.spines["top"].set_visible(False)
|
||||
ax.spines["right"].set_visible(False)
|
||||
p = os.path.join(output_dir, "body_parts.png")
|
||||
_save(fig, p)
|
||||
charts["body_parts"] = p
|
||||
|
||||
return charts
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════
|
||||
# DOCX GENERATION
|
||||
# ═══════════════════════════════════════════════
|
||||
|
||||
def _set_cell_shading(cell, color_hex):
|
||||
"""Apply background shading to a table cell."""
|
||||
shading = parse_xml(f'<w:shd {nsdecls("w")} w:fill="{color_hex}" w:val="clear"/>')
|
||||
cell._tc.get_or_add_tcPr().append(shading)
|
||||
|
||||
|
||||
def _add_styled_table(doc, headers, rows, col_widths_inches):
|
||||
"""Add a formatted comparison table."""
|
||||
table = doc.add_table(rows=1 + len(rows), cols=len(headers))
|
||||
table.alignment = WD_TABLE_ALIGNMENT.LEFT
|
||||
table.style = "Table Grid"
|
||||
|
||||
# Header row
|
||||
for i, h in enumerate(headers):
|
||||
cell = table.rows[0].cells[i]
|
||||
cell.text = ""
|
||||
p = cell.paragraphs[0]
|
||||
run = p.add_run(h)
|
||||
run.bold = True
|
||||
run.font.size = Pt(9)
|
||||
run.font.color.rgb = RGBColor(0xFF, 0xFF, 0xFF)
|
||||
run.font.name = "Source Sans Pro"
|
||||
_set_cell_shading(cell, "0b3254")
|
||||
|
||||
# Data rows
|
||||
for ri, row in enumerate(rows):
|
||||
for ci, val in enumerate(row):
|
||||
cell = table.rows[ri + 1].cells[ci]
|
||||
cell.text = ""
|
||||
p = cell.paragraphs[0]
|
||||
run = p.add_run(str(val))
|
||||
run.font.size = Pt(9)
|
||||
run.font.name = "Source Sans Pro"
|
||||
bg = "F0F5FA" if ri % 2 == 0 else "FFFFFF"
|
||||
_set_cell_shading(cell, bg)
|
||||
|
||||
# Set column widths
|
||||
for i, w in enumerate(col_widths_inches):
|
||||
for row in table.rows:
|
||||
row.cells[i].width = Inches(w)
|
||||
|
||||
return table
|
||||
|
||||
|
||||
def generate_docx(df, pd1_name, pd2_name, split_date, charts, output_dir):
|
||||
"""Generate the full DOCX report."""
|
||||
doc = Document()
|
||||
|
||||
# Set default font
|
||||
style = doc.styles["Normal"]
|
||||
style.font.name = "Source Sans Pro"
|
||||
style.font.size = Pt(11)
|
||||
|
||||
# Heading styles
|
||||
for level, size, color in [(1, 16, NAVY), (2, 13, TEAL)]:
|
||||
hs = doc.styles[f"Heading {level}"]
|
||||
hs.font.name = "Source Sans Pro"
|
||||
hs.font.size = Pt(size)
|
||||
hs.font.color.rgb = color
|
||||
hs.font.bold = True
|
||||
|
||||
pd1 = df[df["PD"] == "pd1"]
|
||||
pd2 = df[df["PD"] == "pd2"]
|
||||
total = len(df)
|
||||
pd1_months = max(1, (pd.Timestamp(split_date) - df["Event Date"].min()).days / 30.44)
|
||||
pd2_months = max(1, (df["Event Date"].max() - pd.Timestamp(split_date)).days / 30.44 + 1)
|
||||
|
||||
pd1_start = pd1["Event Date"].min().strftime("%b %Y") if len(pd1) > 0 else "N/A"
|
||||
pd1_end = pd1["Event Date"].max().strftime("%b %Y") if len(pd1) > 0 else "N/A"
|
||||
pd2_start = pd2["Event Date"].min().strftime("%b %Y") if len(pd2) > 0 else "N/A"
|
||||
pd2_end = pd2["Event Date"].max().strftime("%b %Y") if len(pd2) > 0 else "N/A"
|
||||
|
||||
# ── Title page ──
|
||||
doc.add_paragraph("")
|
||||
doc.add_paragraph("")
|
||||
p = doc.add_paragraph()
|
||||
p.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||||
run = p.add_run("SHEQ Incident Analysis")
|
||||
run.font.size = Pt(28)
|
||||
run.bold = True
|
||||
run.font.name = "Source Sans Pro"
|
||||
run.font.color.rgb = NAVY
|
||||
|
||||
p = doc.add_paragraph()
|
||||
p.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||||
run = p.add_run("Far North Waters Project")
|
||||
run.font.size = Pt(16)
|
||||
run.font.name = "Source Sans Pro"
|
||||
run.font.color.rgb = TEAL
|
||||
|
||||
p = doc.add_paragraph()
|
||||
p.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||||
run = p.add_run(f"{pd1_start} \u2013 {pd2_end} (MTD)")
|
||||
run.font.size = Pt(14)
|
||||
run.font.name = "Source Sans Pro"
|
||||
run.font.color.rgb = TEAL
|
||||
|
||||
doc.add_paragraph("")
|
||||
p = doc.add_paragraph()
|
||||
p.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||||
run = p.add_run("Performance by Project Director")
|
||||
run.font.size = Pt(13)
|
||||
run.bold = True
|
||||
run.font.name = "Source Sans Pro"
|
||||
run.font.color.rgb = NAVY
|
||||
|
||||
p = doc.add_paragraph()
|
||||
p.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||||
run = p.add_run(f"{pd1_name} ")
|
||||
run.bold = True
|
||||
run.font.color.rgb = NAVY
|
||||
run = p.add_run(f"({pd1_start} \u2013 {pd1_end}) | ")
|
||||
run.font.color.rgb = GREY
|
||||
run = p.add_run(f"{pd2_name} ")
|
||||
run.bold = True
|
||||
run.font.color.rgb = TEAL
|
||||
run = p.add_run(f"({pd2_start} \u2013 {pd2_end})")
|
||||
run.font.color.rgb = GREY
|
||||
|
||||
doc.add_paragraph("")
|
||||
p = doc.add_paragraph()
|
||||
p.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||||
run = p.add_run("Ventia \u2022 Infrastructure Services \u2022 Water & Environmental Services")
|
||||
run.font.size = Pt(10)
|
||||
run.font.color.rgb = GREY
|
||||
|
||||
doc.add_page_break()
|
||||
|
||||
# ── Helper functions ──
|
||||
def h1(text):
|
||||
doc.add_heading(text, level=1)
|
||||
|
||||
def h2(text):
|
||||
doc.add_heading(text, level=2)
|
||||
|
||||
def text(t, bold=False):
|
||||
p = doc.add_paragraph()
|
||||
run = p.add_run(t)
|
||||
run.bold = bold
|
||||
return p
|
||||
|
||||
def bullet(t):
|
||||
p = doc.add_paragraph(t, style="List Bullet")
|
||||
return p
|
||||
|
||||
def add_chart(name, width=5.5):
|
||||
if name in charts:
|
||||
doc.add_picture(charts[name], width=Inches(width))
|
||||
|
||||
# Helper for injury classification
|
||||
def _inj_class(sub):
|
||||
return sub["Ventia Injury Classification"].value_counts()
|
||||
|
||||
# ═══════════════════════════════════════════
|
||||
# 1. EXECUTIVE SUMMARY
|
||||
# ═══════════════════════════════════════════
|
||||
h1("1. Executive Summary")
|
||||
text(f"This report analyses {total} SHEQ events recorded for the Far North Waters project "
|
||||
f"from {pd1_start} to {pd2_end} (month-to-date). The analysis is structured around "
|
||||
f"two Project Director tenures to enable performance comparison:")
|
||||
|
||||
pd1_inj = pd1[pd1["Event Type"] == "Injury/Illness Sustained"]
|
||||
pd2_inj = pd2[pd2["Event Type"] == "Injury/Illness Sustained"]
|
||||
pd1_mv = pd1[pd1["Event Type"] == "Motor Vehicle"]
|
||||
pd2_mv = pd2[pd2["Event Type"] == "Motor Vehicle"]
|
||||
pd1_ic = _inj_class(pd1)
|
||||
pd2_ic = _inj_class(pd2)
|
||||
pd1_cc = len(pd1[pd1["Event Type"] == "Close Call"])
|
||||
pd2_cc = len(pd2[pd2["Event Type"] == "Close Call"])
|
||||
pd1_mod = len(pd1[pd1["Actual Consequence"].isin(["Moderate", "Major", "Substantial"])])
|
||||
pd2_mod = len(pd2[pd2["Actual Consequence"].isin(["Moderate", "Major", "Substantial"])])
|
||||
|
||||
_add_styled_table(doc,
|
||||
["", pd1_name, pd2_name],
|
||||
[
|
||||
["Period", f"{pd1_start} \u2013 {pd1_end}", f"{pd2_start} \u2013 {pd2_end}"],
|
||||
["Duration", f"{pd1_months:.0f} months", f"{pd2_months:.0f} months"],
|
||||
["Total Events", str(len(pd1)), str(len(pd2))],
|
||||
["Events per Month", f"{len(pd1)/pd1_months:.1f}", f"{len(pd2)/pd2_months:.1f}"],
|
||||
["Injuries", f"{len(pd1_inj)} ({len(pd1_inj)/max(len(pd1),1)*100:.1f}%)",
|
||||
f"{len(pd2_inj)} ({len(pd2_inj)/max(len(pd2),1)*100:.1f}%)"],
|
||||
["Motor Vehicle Events", f"{len(pd1_mv)} ({len(pd1_mv)/max(len(pd1),1)*100:.1f}%)",
|
||||
f"{len(pd2_mv)} ({len(pd2_mv)/max(len(pd2),1)*100:.1f}%)"],
|
||||
["Lost Time Injuries", str(pd1_ic.get("Lost Time Injury", 0)), str(pd2_ic.get("Lost Time Injury", 0))],
|
||||
["First Aid Treatments", str(pd1_ic.get("First Aid Treatment", 0)), str(pd2_ic.get("First Aid Treatment", 0))],
|
||||
["Close Calls", f"{pd1_cc} ({pd1_cc/max(len(pd1),1)*100:.1f}%)",
|
||||
f"{pd2_cc} ({pd2_cc/max(len(pd2),1)*100:.1f}%)"],
|
||||
["Moderate+ Consequence", f"{pd1_mod} ({pd1_mod/max(len(pd1),1)*100:.1f}%)",
|
||||
f"{pd2_mod} ({pd2_mod/max(len(pd2),1)*100:.1f}%)"],
|
||||
["Median Days to Investigate", f"{pd1['Days to Investigate'].dropna().median():.0f}",
|
||||
f"{pd2['Days to Investigate'].dropna().median():.0f}"],
|
||||
["Median Days to Close", f"{pd1['Days to Close'].dropna().median():.0f}",
|
||||
f"{pd2['Days to Close'].dropna().median():.0f}"],
|
||||
],
|
||||
[2.0, 2.2, 2.3]
|
||||
)
|
||||
|
||||
doc.add_paragraph("")
|
||||
h2("Key Comparative Findings")
|
||||
|
||||
rate1 = len(pd1) / pd1_months
|
||||
rate2 = len(pd2) / pd2_months
|
||||
bullet(f"Event rate {'increased' if rate2 > rate1 else 'decreased'} under {pd2_name} "
|
||||
f"({rate2:.1f}/month vs {rate1:.1f}/month), with Moderate+ consequences at "
|
||||
f"{pd2_mod/max(len(pd2),1)*100:.1f}% vs {pd1_mod/max(len(pd1),1)*100:.1f}%.")
|
||||
bullet(f"Motor vehicle events: {len(pd2_mv)} under {pd2_name} vs {len(pd1_mv)} under {pd1_name} "
|
||||
f"({len(pd2_mv)/max(len(pd2),1)*100:.1f}% vs {len(pd1_mv)/max(len(pd1),1)*100:.1f}%).")
|
||||
bullet(f"Close call reporting: {pd2_cc/max(len(pd2),1)*100:.1f}% under {pd2_name} vs "
|
||||
f"{pd1_cc/max(len(pd1),1)*100:.1f}% under {pd1_name}.")
|
||||
|
||||
lti1 = pd1_ic.get("Lost Time Injury", 0)
|
||||
lti2 = pd2_ic.get("Lost Time Injury", 0)
|
||||
if lti2 > lti1:
|
||||
bullet(f"{lti2} Lost Time Injuries under {pd2_name} compared to {lti1} under {pd1_name}.")
|
||||
|
||||
doc.add_page_break()
|
||||
|
||||
# ═══════════════════════════════════════════
|
||||
# 2. MONTHLY TRENDS
|
||||
# ═══════════════════════════════════════════
|
||||
h1("2. Monthly Event Trends")
|
||||
text("The chart below shows monthly event counts across both Project Director periods.")
|
||||
add_chart("monthly_by_pd", 5.8)
|
||||
doc.add_page_break()
|
||||
|
||||
# ═══════════════════════════════════════════
|
||||
# 3. EVENT TYPE COMPARISON
|
||||
# ═══════════════════════════════════════════
|
||||
h1("3. Event Type Comparison")
|
||||
add_chart("event_type_by_pd", 5.5)
|
||||
|
||||
evt_types = df["Event Type"].value_counts().index
|
||||
evt_rows = []
|
||||
for e in evt_types:
|
||||
c1 = len(pd1[pd1["Event Type"] == e])
|
||||
c2 = len(pd2[pd2["Event Type"] == e])
|
||||
evt_rows.append([e, str(c1), f"{c1/max(len(pd1),1)*100:.1f}%",
|
||||
str(c2), f"{c2/max(len(pd2),1)*100:.1f}%"])
|
||||
_add_styled_table(doc, ["Event Type", pd1_name, "%", pd2_name, "%"], evt_rows,
|
||||
[2.0, 1.1, 0.8, 1.0, 0.8])
|
||||
|
||||
doc.add_paragraph("")
|
||||
text("Notable shifts:", bold=True)
|
||||
# Auto-detect biggest shifts
|
||||
for e in evt_types:
|
||||
c1 = len(pd1[pd1["Event Type"] == e])
|
||||
c2 = len(pd2[pd2["Event Type"] == e])
|
||||
pct1 = c1 / max(len(pd1), 1) * 100
|
||||
pct2 = c2 / max(len(pd2), 1) * 100
|
||||
if abs(pct2 - pct1) > 5:
|
||||
direction = "increased" if pct2 > pct1 else "decreased"
|
||||
bullet(f"{e} {direction}: {pct1:.1f}% \u2192 {pct2:.1f}% ({c1} \u2192 {c2} events).")
|
||||
|
||||
doc.add_page_break()
|
||||
|
||||
# ═══════════════════════════════════════════
|
||||
# 4. INJURY ANALYSIS
|
||||
# ═══════════════════════════════════════════
|
||||
h1("4. Injury Analysis")
|
||||
h2("4.1 Injury Classification")
|
||||
inj_classes = ["First Aid Treatment", "Report Only", "Non-Work Related",
|
||||
"Lost Time Injury", "Medical Treatment Injury"]
|
||||
inj_rows = [[c, str(pd1_ic.get(c, 0)), str(pd2_ic.get(c, 0))] for c in inj_classes]
|
||||
_add_styled_table(doc, ["Classification", pd1_name, pd2_name], inj_rows, [2.5, 1.8, 1.8])
|
||||
|
||||
h2("4.2 Body Parts Injured")
|
||||
add_chart("body_parts", 5.0)
|
||||
|
||||
# Body part comparison
|
||||
bp1 = get_body_parts(pd1["Bodily Location"]).value_counts().head(6)
|
||||
bp2 = get_body_parts(pd2["Bodily Location"]).value_counts().head(6)
|
||||
all_bp = list(dict.fromkeys(list(bp1.index) + list(bp2.index)))[:8]
|
||||
bp_rows = [[bp, str(bp1.get(bp, 0)), str(bp2.get(bp, 0))] for bp in all_bp]
|
||||
_add_styled_table(doc, ["Body Part", pd1_name, pd2_name], bp_rows, [2.5, 1.8, 1.8])
|
||||
|
||||
doc.add_page_break()
|
||||
|
||||
# ═══════════════════════════════════════════
|
||||
# 5. CONSEQUENCE ANALYSIS
|
||||
# ═══════════════════════════════════════════
|
||||
h1("5. Consequence Analysis")
|
||||
add_chart("consequence_by_pd", 5.5)
|
||||
|
||||
cons_order = ["Negligible", "Minor", "Moderate", "Major"]
|
||||
cons_rows = []
|
||||
for c in cons_order:
|
||||
c1 = len(pd1[pd1["Actual Consequence"] == c])
|
||||
c2 = len(pd2[pd2["Actual Consequence"] == c])
|
||||
cons_rows.append([c, str(c1), f"{c1/max(len(pd1),1)*100:.1f}%",
|
||||
str(c2), f"{c2/max(len(pd2),1)*100:.1f}%"])
|
||||
_add_styled_table(doc, ["Consequence", pd1_name, "%", pd2_name, "%"], cons_rows,
|
||||
[1.5, 1.0, 0.8, 1.0, 0.8])
|
||||
|
||||
doc.add_page_break()
|
||||
|
||||
# ═══════════════════════════════════════════
|
||||
# 6. CRP & ROOT CAUSE
|
||||
# ═══════════════════════════════════════════
|
||||
h1("6. Critical Risk Protocols & Root Causes")
|
||||
h2("6.1 CRP Comparison")
|
||||
add_chart("crp_by_pd", 5.5)
|
||||
|
||||
h2("6.2 Root Cause Comparison")
|
||||
add_chart("rootcause_by_pd", 5.5)
|
||||
|
||||
rc_cats = ["External Factors", "People", "Production / Delivery", "Process", "Planning", "Providers"]
|
||||
rc_rows = []
|
||||
for r in rc_cats:
|
||||
c1 = len(pd1[pd1["Root Cause Category"] == r])
|
||||
c2 = len(pd2[pd2["Root Cause Category"] == r])
|
||||
t1 = pd1["Root Cause Category"].notna().sum()
|
||||
t2 = pd2["Root Cause Category"].notna().sum()
|
||||
rc_rows.append([r, str(c1), f"{c1/max(t1,1)*100:.1f}%",
|
||||
str(c2), f"{c2/max(t2,1)*100:.1f}%"])
|
||||
_add_styled_table(doc, ["Root Cause", pd1_name, "%", pd2_name, "%"], rc_rows,
|
||||
[2.0, 1.1, 0.8, 1.0, 0.8])
|
||||
|
||||
doc.add_page_break()
|
||||
|
||||
# ═══════════════════════════════════════════
|
||||
# 7. TIMING PATTERNS
|
||||
# ═══════════════════════════════════════════
|
||||
h1("7. Timing Patterns")
|
||||
add_chart("dow_by_pd", 5.5)
|
||||
|
||||
doc.add_page_break()
|
||||
|
||||
# ═══════════════════════════════════════════
|
||||
# 8. INVESTIGATION PERFORMANCE
|
||||
# ═══════════════════════════════════════════
|
||||
h1("8. Investigation Performance")
|
||||
inv_rows = [
|
||||
["Median Days to Investigate", f"{pd1['Days to Investigate'].dropna().median():.0f}",
|
||||
f"{pd2['Days to Investigate'].dropna().median():.0f}"],
|
||||
["Mean Days to Investigate", f"{pd1['Days to Investigate'].dropna().mean():.1f}",
|
||||
f"{pd2['Days to Investigate'].dropna().mean():.1f}"],
|
||||
["Median Days to Close", f"{pd1['Days to Close'].dropna().median():.0f}",
|
||||
f"{pd2['Days to Close'].dropna().median():.0f}"],
|
||||
["Mean Days to Close", f"{pd1['Days to Close'].dropna().mean():.1f}",
|
||||
f"{pd2['Days to Close'].dropna().mean():.1f}"],
|
||||
["Events Closed", f"{(pd1['Status']=='Closed').sum()} ({(pd1['Status']=='Closed').sum()/max(len(pd1),1)*100:.0f}%)",
|
||||
f"{(pd2['Status']=='Closed').sum()} ({(pd2['Status']=='Closed').sum()/max(len(pd2),1)*100:.0f}%)"],
|
||||
["Events Open", str((pd1["Status"] == "Open").sum()), str((pd2["Status"] == "Open").sum())],
|
||||
]
|
||||
_add_styled_table(doc, ["Metric", pd1_name, pd2_name], inv_rows, [2.5, 1.8, 1.8])
|
||||
|
||||
doc.add_page_break()
|
||||
|
||||
# ═══════════════════════════════════════════
|
||||
# 9. RECOMMENDATIONS
|
||||
# ═══════════════════════════════════════════
|
||||
h1("9. Key Findings & Recommendations")
|
||||
|
||||
h2(f"9.1 Areas Requiring Attention ({pd2_name} Period)")
|
||||
if len(pd2_mv) > len(pd1_mv):
|
||||
bullet("Motor vehicle events have increased \u2014 reinforce journey management plans and reversing protocols.")
|
||||
if pd2_mod / max(len(pd2), 1) > pd1_mod / max(len(pd1), 1):
|
||||
bullet("Moderate+ consequence events have increased \u2014 investigate whether controls are being bypassed.")
|
||||
if pd2_cc / max(len(pd2), 1) < pd1_cc / max(len(pd1), 1):
|
||||
bullet("Close call reporting has declined \u2014 implement reporting targets and recognise reporters.")
|
||||
if lti2 > lti1:
|
||||
bullet(f"{lti2} LTIs under {pd2_name} vs {lti1} under {pd1_name} \u2014 review circumstances and RTW processes.")
|
||||
|
||||
h2("9.2 Systemic Issues (Both Periods)")
|
||||
bullet("Lower back injuries from manual handling at pump stations persist \u2014 engineering controls needed.")
|
||||
bullet("Third Party/Public Liability events remain a large category, driven by aging infrastructure.")
|
||||
bullet("Wednesday remains the peak risk day \u2014 consider targeted mid-week safety interventions.")
|
||||
|
||||
h2("9.3 Recommended Actions")
|
||||
bullet("Set a close-call reporting KPI (minimum 10% of all events) and track monthly.")
|
||||
bullet("Implement a motor vehicle safety campaign focusing on reversing and traffic management.")
|
||||
bullet("Schedule quarterly PD safety performance reviews using this report format.")
|
||||
|
||||
# ── Save ──
|
||||
output_path = os.path.join(output_dir, "SHEQ_PD_Comparison.docx")
|
||||
doc.save(output_path)
|
||||
return output_path
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════
|
||||
# MAIN ENTRY POINT
|
||||
# ═══════════════════════════════════════════════
|
||||
|
||||
def run_analysis(filepath, start_date, split_date, pd1_name, pd2_name, output_dir="output"):
|
||||
"""Run the full analysis pipeline."""
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
|
||||
print(f"Loading data from {filepath}...")
|
||||
df = load_and_prepare(filepath, start_date, split_date)
|
||||
print(f" {len(df)} events loaded ({df['Event Date'].min().date()} to {df['Event Date'].max().date()})")
|
||||
print(f" {pd1_name}: {(df['PD']=='pd1').sum()} events")
|
||||
print(f" {pd2_name}: {(df['PD']=='pd2').sum()} events")
|
||||
|
||||
print("Generating charts...")
|
||||
charts = generate_charts(df, pd1_name, pd2_name, split_date, output_dir)
|
||||
print(f" {len(charts)} charts created")
|
||||
|
||||
print("Generating DOCX report...")
|
||||
docx_path = generate_docx(df, pd1_name, pd2_name, split_date, charts, output_dir)
|
||||
print(f" Report saved to {docx_path}")
|
||||
|
||||
return docx_path
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_analysis(
|
||||
filepath="All_Events__5_.xlsx",
|
||||
start_date="2024-01-01",
|
||||
split_date="2025-04-01",
|
||||
pd1_name="Matthew Arthur",
|
||||
pd2_name="Manga",
|
||||
output_dir="output"
|
||||
)