sheq-analysis-tool/analysis.py

"""
SHEQ Incident Analysis Engine
Generates charts and a DOCX report comparing two Project Director periods.

Usage:
    from analysis import run_analysis
    run_analysis("All_Events__5_.xlsx", "2024-01-01", "2025-04-01",
                 "Matthew Arthur", "Manga", output_dir="output")
"""

import os
import pandas as pd
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import numpy as np
from docx import Document
from docx.shared import Inches, Pt, Cm, RGBColor, Emu
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT
from docx.oxml.ns import qn, nsdecls
from docx.oxml import parse_xml
from io import BytesIO


# ── Brand Colours (see DESIGN.md) ──
# Primary
DEEP_BLUE = RGBColor(0x0B, 0x32, 0x54)
SKY_BLUE = RGBColor(0x13, 0xB5, 0xEA)
# Secondary
DARK_GREEN = RGBColor(0x00, 0x6E, 0x47)
MID_GREEN = RGBColor(0x00, 0x99, 0x46)
LIGHT_GREEN = RGBColor(0x7B, 0xC1, 0x43)
PURPLE = RGBColor(0x96, 0x35, 0x8D)
# Functional
GREY = RGBColor(0x64, 0x74, 0x8B)

# Aliases used throughout
NAVY = DEEP_BLUE
TEAL = SKY_BLUE
GREEN = DARK_GREEN

# Hex versions for matplotlib
DEEP_BLUE_HEX = "#0b3254"
SKY_BLUE_HEX = "#13b5ea"
DARK_GREEN_HEX = "#006e47"
MID_GREEN_HEX = "#009946"
LIGHT_GREEN_HEX = "#7bc143"
PURPLE_HEX = "#96358d"
AMBER_HEX = "#d97706"
RED_HEX = "#dc2626"

# Chart palette sequence per DESIGN.md
CHART_PALETTE = [DEEP_BLUE_HEX, SKY_BLUE_HEX, DARK_GREEN_HEX, MID_GREEN_HEX,
                 LIGHT_GREEN_HEX, PURPLE_HEX, AMBER_HEX, RED_HEX]

# PD comparison colours
MA_HEX = DEEP_BLUE_HEX   # PD1 = Deep Blue
MG_HEX = SKY_BLUE_HEX    # PD2 = Sky Blue


# ═══════════════════════════════════════════════
# DATA LOADING & PREPARATION
# ═══════════════════════════════════════════════

def load_and_prepare(filepath, start_date, split_date):
    """Load Excel, filter by date range, add PD column."""
    df = pd.read_excel(filepath)
    df["Event Date"] = pd.to_datetime(df["Event Date"])
    df = df[df["Event Date"] >= pd.Timestamp(start_date)].copy()
    df["Year"] = df["Event Date"].dt.year
    df["Month"] = df["Event Date"].dt.month
    df["MonthName"] = df["Event Date"].dt.strftime("%b")
    df["DOW"] = df["Event Date"].dt.day_name()
    df["YearMonth"] = df["Event Date"].dt.to_period("M")
    df["PD"] = df["Event Date"].apply(
        lambda x: "pd1" if x < pd.Timestamp(split_date) else "pd2"
    )
    return df


def get_body_parts(series):
    """Split multi-value body part entries and normalise."""
    parts = []
    for val in series.dropna():
        for part in str(val).split(","):
            part = part.strip()
            if part and "unspecified" not in part.lower():
                parts.append(part)
    return pd.Series(parts)


# ═══════════════════════════════════════════════
# CHART GENERATION
# ═══════════════════════════════════════════════

def _save(fig, path):
    fig.tight_layout()
    fig.savefig(path, dpi=200, bbox_inches="tight", facecolor="white")
    plt.close(fig)


def _setup_chart_style():
    """Configure matplotlib to use Source Sans Pro if available."""
    import matplotlib.font_manager as fm
    available = [f.name for f in fm.fontManager.ttflist]
    if "Source Sans Pro" in available:
        plt.rcParams["font.family"] = "Source Sans Pro"
    elif "Source Sans 3" in available:
        plt.rcParams["font.family"] = "Source Sans 3"
    else:
        plt.rcParams["font.family"] = "sans-serif"


def generate_charts(df, pd1_name, pd2_name, split_date, output_dir):
    """Generate all comparison charts, return dict of paths."""
    _setup_chart_style()
    charts = {}
    pd1 = df[df["PD"] == "pd1"]
    pd2 = df[df["PD"] == "pd2"]

    # Consequence severity colours per DESIGN.md
    CONS_COLORS = [DARK_GREEN_HEX, AMBER_HEX, RED_HEX, PURPLE_HEX]

    # 1. Monthly trend by PD
    fig, ax = plt.subplots(figsize=(10, 4))
    start_period = df["Event Date"].min().to_period("M")
    end_period = df["Event Date"].max().to_period("M")
    months_all = pd.period_range(start_period, end_period, freq="M")
    monthly = df.groupby(["YearMonth", "PD"]).size().unstack(fill_value=0).reindex(months_all, fill_value=0)
    x = range(len(months_all))
    labels = [m.strftime("%b %y") for m in months_all]
    ma_vals = monthly.get("pd1", pd.Series(0, index=months_all)).values
    mg_vals = monthly.get("pd2", pd.Series(0, index=months_all)).values
    ax.bar(x, ma_vals, color=MA_HEX, label=pd1_name, width=0.7, alpha=0.9)
    ax.bar(x, mg_vals, bottom=ma_vals, color=MG_HEX, label=pd2_name, width=0.7, alpha=0.9)
    split_m = pd.Timestamp(split_date).to_period("M")
    if split_m in months_all:
        trans_idx = list(months_all).index(split_m)
        ax.axvline(x=trans_idx - 0.5, color=RED_HEX, linestyle="--", linewidth=1.5, alpha=0.7)
        ax.text(trans_idx - 0.3, max(max(ma_vals + mg_vals), 1) * 0.95, "PD Transition",
                fontsize=9, color=RED_HEX, ha="left")
    ax.set_xticks(x)
    ax.set_xticklabels(labels, rotation=45, ha="right", fontsize=8)
    ax.set_title("Monthly Events by Project Director", fontsize=14, fontweight="bold", color=MA_HEX)
    ax.set_ylabel("Events")
    ax.legend(loc="upper right")
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    p = os.path.join(output_dir, "monthly_by_pd.png")
    _save(fig, p)
    charts["monthly_by_pd"] = p

    # 2. Event type comparison
    evt_types = df["Event Type"].value_counts().index[:8]
    ma_evt = pd1["Event Type"].value_counts().reindex(evt_types, fill_value=0)
    mg_evt = pd2["Event Type"].value_counts().reindex(evt_types, fill_value=0)
    fig, ax = plt.subplots(figsize=(9, 5))
    y = np.arange(len(evt_types))
    h = 0.35
    ax.barh(y - h / 2, ma_evt.values, h, label=pd1_name, color=MA_HEX)
    ax.barh(y + h / 2, mg_evt.values, h, label=pd2_name, color=MG_HEX)
    ax.set_yticks(y)
    ax.set_yticklabels(evt_types, fontsize=10)
    ax.invert_yaxis()
    ax.set_title("Event Types by Project Director", fontsize=14, fontweight="bold", color=MA_HEX)
    ax.legend()
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    for i, (v1, v2) in enumerate(zip(ma_evt.values, mg_evt.values)):
        ax.text(v1 + 0.2, i - h / 2, str(v1), va="center", fontsize=9, color=MA_HEX)
        ax.text(v2 + 0.2, i + h / 2, str(v2), va="center", fontsize=9, color=MG_HEX)
    p = os.path.join(output_dir, "event_type_by_pd.png")
    _save(fig, p)
    charts["event_type_by_pd"] = p

    # 3. Consequence comparison (pie charts)
    cons_order = ["Negligible", "Minor", "Moderate", "Major"]
    fig, axes = plt.subplots(1, 2, figsize=(9, 3.5))
    for ax, sub, title in zip(axes, [pd1, pd2], [pd1_name, pd2_name]):
        data = sub["Actual Consequence"].value_counts().reindex(cons_order, fill_value=0)
        ax.pie(data.values, labels=cons_order, autopct="%1.0f%%", colors=CONS_COLORS, startangle=140,
               textprops={"fontsize": 9})
        ax.set_title(title, fontsize=13, fontweight="bold", color=MA_HEX)
    p = os.path.join(output_dir, "consequence_by_pd.png")
    _save(fig, p)
    charts["consequence_by_pd"] = p

    # 4. Day of week
    dow_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
    fig, ax = plt.subplots(figsize=(9, 4))
    x_arr = np.arange(len(dow_order))
    w = 0.35
    ma_d = pd1["DOW"].value_counts().reindex(dow_order, fill_value=0)
    mg_d = pd2["DOW"].value_counts().reindex(dow_order, fill_value=0)
    b1 = ax.bar(x_arr - w / 2, ma_d.values, w, label=pd1_name, color=MA_HEX)
    b2 = ax.bar(x_arr + w / 2, mg_d.values, w, label=pd2_name, color=MG_HEX)
    ax.set_xticks(x_arr)
    ax.set_xticklabels([d[:3] for d in dow_order])
    ax.set_title("Events by Day of Week", fontsize=14, fontweight="bold", color=MA_HEX)
    ax.legend()
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    for b in b1:
        if b.get_height() > 0:
            ax.text(b.get_x() + b.get_width() / 2, b.get_height() + 0.3, str(int(b.get_height())),
                    ha="center", fontsize=9)
    for b in b2:
        if b.get_height() > 0:
            ax.text(b.get_x() + b.get_width() / 2, b.get_height() + 0.3, str(int(b.get_height())),
                    ha="center", fontsize=9)
    p = os.path.join(output_dir, "dow_by_pd.png")
    _save(fig, p)
    charts["dow_by_pd"] = p

    # 5. Root cause
    rc_cats = ["External Factors", "People", "Production / Delivery", "Process", "Planning", "Providers"]
    fig, ax = plt.subplots(figsize=(9, 4))
    y = np.arange(len(rc_cats))
    h = 0.35
    ma_rc = pd1["Root Cause Category"].value_counts().reindex(rc_cats, fill_value=0)
    mg_rc = pd2["Root Cause Category"].value_counts().reindex(rc_cats, fill_value=0)
    ax.barh(y - h / 2, ma_rc.values, h, label=pd1_name, color=MA_HEX)
    ax.barh(y + h / 2, mg_rc.values, h, label=pd2_name, color=MG_HEX)
    ax.set_yticks(y)
    ax.set_yticklabels(rc_cats, fontsize=10)
    ax.invert_yaxis()
    ax.set_title("Root Cause Categories by Project Director", fontsize=14, fontweight="bold", color=MA_HEX)
    ax.legend()
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    p = os.path.join(output_dir, "rootcause_by_pd.png")
    _save(fig, p)
    charts["rootcause_by_pd"] = p

    # 6. CRP comparison
    crp_all = df["CRPInvolved"].value_counts()
    crp_active = crp_all[~crp_all.index.isin(["None Identified", "Under Investigation"])].head(8)
    crp_cats = crp_active.index
    fig, ax = plt.subplots(figsize=(9, 4.5))
    y = np.arange(len(crp_cats))
    ma_c = pd1["CRPInvolved"].value_counts().reindex(crp_cats, fill_value=0)
    mg_c = pd2["CRPInvolved"].value_counts().reindex(crp_cats, fill_value=0)
    ax.barh(y - h / 2, ma_c.values, h, label=pd1_name, color=MA_HEX)
    ax.barh(y + h / 2, mg_c.values, h, label=pd2_name, color=MG_HEX)
    ax.set_yticks(y)
    ax.set_yticklabels(crp_cats, fontsize=9)
    ax.invert_yaxis()
    ax.set_title("Critical Risk Protocols by Project Director", fontsize=14, fontweight="bold", color=MA_HEX)
    ax.legend()
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    p = os.path.join(output_dir, "crp_by_pd.png")
    _save(fig, p)
    charts["crp_by_pd"] = p

    # 7. Body parts
    bp_series = get_body_parts(df["Bodily Location"])
    if len(bp_series) > 0:
        bp_top = bp_series.value_counts().head(10)
        fig, ax = plt.subplots(figsize=(8, 4))
        ax.barh(range(len(bp_top)), bp_top.values, color=DARK_GREEN_HEX)
        ax.set_yticks(range(len(bp_top)))
        ax.set_yticklabels(bp_top.index, fontsize=10)
        ax.invert_yaxis()
        for i, v in enumerate(bp_top.values):
            ax.text(v + 0.1, i, str(v), va="center", fontsize=11, fontweight="bold")
        ax.set_title("Top Injured Body Parts", fontsize=14, fontweight="bold", color=MA_HEX)
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        p = os.path.join(output_dir, "body_parts.png")
        _save(fig, p)
        charts["body_parts"] = p

    return charts


# ═══════════════════════════════════════════════
# DOCX GENERATION
# ═══════════════════════════════════════════════

def _set_cell_shading(cell, color_hex):
    """Apply background shading to a table cell."""
    shading = parse_xml(f'<w:shd {nsdecls("w")} w:fill="{color_hex}" w:val="clear"/>')
    cell._tc.get_or_add_tcPr().append(shading)


def _add_styled_table(doc, headers, rows, col_widths_inches):
    """Add a formatted comparison table."""
    table = doc.add_table(rows=1 + len(rows), cols=len(headers))
    table.alignment = WD_TABLE_ALIGNMENT.LEFT
    table.style = "Table Grid"

    # Header row
    for i, h in enumerate(headers):
        cell = table.rows[0].cells[i]
        cell.text = ""
        p = cell.paragraphs[0]
        run = p.add_run(h)
        run.bold = True
        run.font.size = Pt(9)
        run.font.color.rgb = RGBColor(0xFF, 0xFF, 0xFF)
        run.font.name = "Source Sans Pro"
        _set_cell_shading(cell, "0b3254")

    # Data rows
    for ri, row in enumerate(rows):
        for ci, val in enumerate(row):
            cell = table.rows[ri + 1].cells[ci]
            cell.text = ""
            p = cell.paragraphs[0]
            run = p.add_run(str(val))
            run.font.size = Pt(9)
            run.font.name = "Source Sans Pro"
            bg = "F0F5FA" if ri % 2 == 0 else "FFFFFF"
            _set_cell_shading(cell, bg)

    # Set column widths
    for i, w in enumerate(col_widths_inches):
        for row in table.rows:
            row.cells[i].width = Inches(w)

    return table


def generate_docx(df, pd1_name, pd2_name, split_date, charts, output_dir):
    """Generate the full DOCX report."""
    doc = Document()

    # Set default font
    style = doc.styles["Normal"]
    style.font.name = "Source Sans Pro"
    style.font.size = Pt(11)

    # Heading styles
    for level, size, color in [(1, 16, NAVY), (2, 13, TEAL)]:
        hs = doc.styles[f"Heading {level}"]
        hs.font.name = "Source Sans Pro"
        hs.font.size = Pt(size)
        hs.font.color.rgb = color
        hs.font.bold = True

    pd1 = df[df["PD"] == "pd1"]
    pd2 = df[df["PD"] == "pd2"]
    total = len(df)
    pd1_months = max(1, (pd.Timestamp(split_date) - df["Event Date"].min()).days / 30.44)
    pd2_months = max(1, (df["Event Date"].max() - pd.Timestamp(split_date)).days / 30.44 + 1)

    pd1_start = pd1["Event Date"].min().strftime("%b %Y") if len(pd1) > 0 else "N/A"
    pd1_end = pd1["Event Date"].max().strftime("%b %Y") if len(pd1) > 0 else "N/A"
    pd2_start = pd2["Event Date"].min().strftime("%b %Y") if len(pd2) > 0 else "N/A"
    pd2_end = pd2["Event Date"].max().strftime("%b %Y") if len(pd2) > 0 else "N/A"

    # ── Title page ──
    doc.add_paragraph("")
    doc.add_paragraph("")
    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    run = p.add_run("SHEQ Incident Analysis")
    run.font.size = Pt(28)
    run.bold = True
    run.font.name = "Source Sans Pro"
    run.font.color.rgb = NAVY

    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    run = p.add_run("Far North Waters Project")
    run.font.size = Pt(16)
    run.font.name = "Source Sans Pro"
    run.font.color.rgb = TEAL

    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    run = p.add_run(f"{pd1_start} \u2013 {pd2_end} (MTD)")
    run.font.size = Pt(14)
    run.font.name = "Source Sans Pro"
    run.font.color.rgb = TEAL

    doc.add_paragraph("")
    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    run = p.add_run("Performance by Project Director")
    run.font.size = Pt(13)
    run.bold = True
    run.font.name = "Source Sans Pro"
    run.font.color.rgb = NAVY

    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    run = p.add_run(f"{pd1_name} ")
    run.bold = True
    run.font.color.rgb = NAVY
    run = p.add_run(f"({pd1_start} \u2013 {pd1_end})   |   ")
    run.font.color.rgb = GREY
    run = p.add_run(f"{pd2_name} ")
    run.bold = True
    run.font.color.rgb = TEAL
    run = p.add_run(f"({pd2_start} \u2013 {pd2_end})")
    run.font.color.rgb = GREY

    doc.add_paragraph("")
    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    run = p.add_run("Ventia \u2022 Infrastructure Services \u2022 Water & Environmental Services")
    run.font.size = Pt(10)
    run.font.color.rgb = GREY

    doc.add_page_break()

    # ── Helper functions ──
    def h1(text):
        doc.add_heading(text, level=1)

    def h2(text):
        doc.add_heading(text, level=2)

    def text(t, bold=False):
        p = doc.add_paragraph()
        run = p.add_run(t)
        run.bold = bold
        return p

    def bullet(t):
        p = doc.add_paragraph(t, style="List Bullet")
        return p

    def add_chart(name, width=5.5):
        if name in charts:
            doc.add_picture(charts[name], width=Inches(width))

    # Helper for injury classification
    def _inj_class(sub):
        return sub["Ventia Injury Classification"].value_counts()

    # ═══════════════════════════════════════════
    # 1. EXECUTIVE SUMMARY
    # ═══════════════════════════════════════════
    h1("1. Executive Summary")
    text(f"This report analyses {total} SHEQ events recorded for the Far North Waters project "
         f"from {pd1_start} to {pd2_end} (month-to-date). The analysis is structured around "
         f"two Project Director tenures to enable performance comparison:")

    pd1_inj = pd1[pd1["Event Type"] == "Injury/Illness Sustained"]
    pd2_inj = pd2[pd2["Event Type"] == "Injury/Illness Sustained"]
    pd1_mv = pd1[pd1["Event Type"] == "Motor Vehicle"]
    pd2_mv = pd2[pd2["Event Type"] == "Motor Vehicle"]
    pd1_ic = _inj_class(pd1)
    pd2_ic = _inj_class(pd2)
    pd1_cc = len(pd1[pd1["Event Type"] == "Close Call"])
    pd2_cc = len(pd2[pd2["Event Type"] == "Close Call"])
    pd1_mod = len(pd1[pd1["Actual Consequence"].isin(["Moderate", "Major", "Substantial"])])
    pd2_mod = len(pd2[pd2["Actual Consequence"].isin(["Moderate", "Major", "Substantial"])])

    _add_styled_table(doc,
        ["", pd1_name, pd2_name],
        [
            ["Period", f"{pd1_start} \u2013 {pd1_end}", f"{pd2_start} \u2013 {pd2_end}"],
            ["Duration", f"{pd1_months:.0f} months", f"{pd2_months:.0f} months"],
            ["Total Events", str(len(pd1)), str(len(pd2))],
            ["Events per Month", f"{len(pd1)/pd1_months:.1f}", f"{len(pd2)/pd2_months:.1f}"],
            ["Injuries", f"{len(pd1_inj)} ({len(pd1_inj)/max(len(pd1),1)*100:.1f}%)",
                         f"{len(pd2_inj)} ({len(pd2_inj)/max(len(pd2),1)*100:.1f}%)"],
            ["Motor Vehicle Events", f"{len(pd1_mv)} ({len(pd1_mv)/max(len(pd1),1)*100:.1f}%)",
                                     f"{len(pd2_mv)} ({len(pd2_mv)/max(len(pd2),1)*100:.1f}%)"],
            ["Lost Time Injuries", str(pd1_ic.get("Lost Time Injury", 0)), str(pd2_ic.get("Lost Time Injury", 0))],
            ["First Aid Treatments", str(pd1_ic.get("First Aid Treatment", 0)), str(pd2_ic.get("First Aid Treatment", 0))],
            ["Close Calls", f"{pd1_cc} ({pd1_cc/max(len(pd1),1)*100:.1f}%)",
                            f"{pd2_cc} ({pd2_cc/max(len(pd2),1)*100:.1f}%)"],
            ["Moderate+ Consequence", f"{pd1_mod} ({pd1_mod/max(len(pd1),1)*100:.1f}%)",
                                      f"{pd2_mod} ({pd2_mod/max(len(pd2),1)*100:.1f}%)"],
            ["Median Days to Investigate", f"{pd1['Days to Investigate'].dropna().median():.0f}",
                                           f"{pd2['Days to Investigate'].dropna().median():.0f}"],
            ["Median Days to Close", f"{pd1['Days to Close'].dropna().median():.0f}",
                                     f"{pd2['Days to Close'].dropna().median():.0f}"],
        ],
        [2.0, 2.2, 2.3]
    )

    doc.add_paragraph("")
    h2("Key Comparative Findings")

    rate1 = len(pd1) / pd1_months
    rate2 = len(pd2) / pd2_months
    bullet(f"Event rate {'increased' if rate2 > rate1 else 'decreased'} under {pd2_name} "
           f"({rate2:.1f}/month vs {rate1:.1f}/month), with Moderate+ consequences at "
           f"{pd2_mod/max(len(pd2),1)*100:.1f}% vs {pd1_mod/max(len(pd1),1)*100:.1f}%.")
    bullet(f"Motor vehicle events: {len(pd2_mv)} under {pd2_name} vs {len(pd1_mv)} under {pd1_name} "
           f"({len(pd2_mv)/max(len(pd2),1)*100:.1f}% vs {len(pd1_mv)/max(len(pd1),1)*100:.1f}%).")
    bullet(f"Close call reporting: {pd2_cc/max(len(pd2),1)*100:.1f}% under {pd2_name} vs "
           f"{pd1_cc/max(len(pd1),1)*100:.1f}% under {pd1_name}.")

    lti1 = pd1_ic.get("Lost Time Injury", 0)
    lti2 = pd2_ic.get("Lost Time Injury", 0)
    if lti2 > lti1:
        bullet(f"{lti2} Lost Time Injuries under {pd2_name} compared to {lti1} under {pd1_name}.")

    doc.add_page_break()

    # ═══════════════════════════════════════════
    # 2. MONTHLY TRENDS
    # ═══════════════════════════════════════════
    h1("2. Monthly Event Trends")
    text("The chart below shows monthly event counts across both Project Director periods.")
    add_chart("monthly_by_pd", 5.8)
    doc.add_page_break()

    # ═══════════════════════════════════════════
    # 3. EVENT TYPE COMPARISON
    # ═══════════════════════════════════════════
    h1("3. Event Type Comparison")
    add_chart("event_type_by_pd", 5.5)

    evt_types = df["Event Type"].value_counts().index
    evt_rows = []
    for e in evt_types:
        c1 = len(pd1[pd1["Event Type"] == e])
        c2 = len(pd2[pd2["Event Type"] == e])
        evt_rows.append([e, str(c1), f"{c1/max(len(pd1),1)*100:.1f}%",
                           str(c2), f"{c2/max(len(pd2),1)*100:.1f}%"])
    _add_styled_table(doc, ["Event Type", pd1_name, "%", pd2_name, "%"], evt_rows,
                      [2.0, 1.1, 0.8, 1.0, 0.8])

    doc.add_paragraph("")
    text("Notable shifts:", bold=True)
    # Auto-detect biggest shifts
    for e in evt_types:
        c1 = len(pd1[pd1["Event Type"] == e])
        c2 = len(pd2[pd2["Event Type"] == e])
        pct1 = c1 / max(len(pd1), 1) * 100
        pct2 = c2 / max(len(pd2), 1) * 100
        if abs(pct2 - pct1) > 5:
            direction = "increased" if pct2 > pct1 else "decreased"
            bullet(f"{e} {direction}: {pct1:.1f}% \u2192 {pct2:.1f}% ({c1} \u2192 {c2} events).")

    doc.add_page_break()

    # ═══════════════════════════════════════════
    # 4. INJURY ANALYSIS
    # ═══════════════════════════════════════════
    h1("4. Injury Analysis")
    h2("4.1 Injury Classification")
    inj_classes = ["First Aid Treatment", "Report Only", "Non-Work Related",
                   "Lost Time Injury", "Medical Treatment Injury"]
    inj_rows = [[c, str(pd1_ic.get(c, 0)), str(pd2_ic.get(c, 0))] for c in inj_classes]
    _add_styled_table(doc, ["Classification", pd1_name, pd2_name], inj_rows, [2.5, 1.8, 1.8])

    h2("4.2 Body Parts Injured")
    add_chart("body_parts", 5.0)

    # Body part comparison
    bp1 = get_body_parts(pd1["Bodily Location"]).value_counts().head(6)
    bp2 = get_body_parts(pd2["Bodily Location"]).value_counts().head(6)
    all_bp = list(dict.fromkeys(list(bp1.index) + list(bp2.index)))[:8]
    bp_rows = [[bp, str(bp1.get(bp, 0)), str(bp2.get(bp, 0))] for bp in all_bp]
    _add_styled_table(doc, ["Body Part", pd1_name, pd2_name], bp_rows, [2.5, 1.8, 1.8])

    doc.add_page_break()

    # ═══════════════════════════════════════════
    # 5. CONSEQUENCE ANALYSIS
    # ═══════════════════════════════════════════
    h1("5. Consequence Analysis")
    add_chart("consequence_by_pd", 5.5)

    cons_order = ["Negligible", "Minor", "Moderate", "Major"]
    cons_rows = []
    for c in cons_order:
        c1 = len(pd1[pd1["Actual Consequence"] == c])
        c2 = len(pd2[pd2["Actual Consequence"] == c])
        cons_rows.append([c, str(c1), f"{c1/max(len(pd1),1)*100:.1f}%",
                           str(c2), f"{c2/max(len(pd2),1)*100:.1f}%"])
    _add_styled_table(doc, ["Consequence", pd1_name, "%", pd2_name, "%"], cons_rows,
                      [1.5, 1.0, 0.8, 1.0, 0.8])

    doc.add_page_break()

    # ═══════════════════════════════════════════
    # 6. CRP & ROOT CAUSE
    # ═══════════════════════════════════════════
    h1("6. Critical Risk Protocols & Root Causes")
    h2("6.1 CRP Comparison")
    add_chart("crp_by_pd", 5.5)

    h2("6.2 Root Cause Comparison")
    add_chart("rootcause_by_pd", 5.5)

    rc_cats = ["External Factors", "People", "Production / Delivery", "Process", "Planning", "Providers"]
    rc_rows = []
    for r in rc_cats:
        c1 = len(pd1[pd1["Root Cause Category"] == r])
        c2 = len(pd2[pd2["Root Cause Category"] == r])
        t1 = pd1["Root Cause Category"].notna().sum()
        t2 = pd2["Root Cause Category"].notna().sum()
        rc_rows.append([r, str(c1), f"{c1/max(t1,1)*100:.1f}%",
                          str(c2), f"{c2/max(t2,1)*100:.1f}%"])
    _add_styled_table(doc, ["Root Cause", pd1_name, "%", pd2_name, "%"], rc_rows,
                      [2.0, 1.1, 0.8, 1.0, 0.8])

    doc.add_page_break()

    # ═══════════════════════════════════════════
    # 7. TIMING PATTERNS
    # ═══════════════════════════════════════════
    h1("7. Timing Patterns")
    add_chart("dow_by_pd", 5.5)

    doc.add_page_break()

    # ═══════════════════════════════════════════
    # 8. INVESTIGATION PERFORMANCE
    # ═══════════════════════════════════════════
    h1("8. Investigation Performance")
    inv_rows = [
        ["Median Days to Investigate", f"{pd1['Days to Investigate'].dropna().median():.0f}",
                                       f"{pd2['Days to Investigate'].dropna().median():.0f}"],
        ["Mean Days to Investigate", f"{pd1['Days to Investigate'].dropna().mean():.1f}",
                                     f"{pd2['Days to Investigate'].dropna().mean():.1f}"],
        ["Median Days to Close", f"{pd1['Days to Close'].dropna().median():.0f}",
                                 f"{pd2['Days to Close'].dropna().median():.0f}"],
        ["Mean Days to Close", f"{pd1['Days to Close'].dropna().mean():.1f}",
                               f"{pd2['Days to Close'].dropna().mean():.1f}"],
        ["Events Closed", f"{(pd1['Status']=='Closed').sum()} ({(pd1['Status']=='Closed').sum()/max(len(pd1),1)*100:.0f}%)",
                          f"{(pd2['Status']=='Closed').sum()} ({(pd2['Status']=='Closed').sum()/max(len(pd2),1)*100:.0f}%)"],
        ["Events Open", str((pd1["Status"] == "Open").sum()), str((pd2["Status"] == "Open").sum())],
    ]
    _add_styled_table(doc, ["Metric", pd1_name, pd2_name], inv_rows, [2.5, 1.8, 1.8])

    doc.add_page_break()

    # ═══════════════════════════════════════════
    # 9. RECOMMENDATIONS
    # ═══════════════════════════════════════════
    h1("9. Key Findings & Recommendations")

    h2(f"9.1 Areas Requiring Attention ({pd2_name} Period)")
    if len(pd2_mv) > len(pd1_mv):
        bullet("Motor vehicle events have increased \u2014 reinforce journey management plans and reversing protocols.")
    if pd2_mod / max(len(pd2), 1) > pd1_mod / max(len(pd1), 1):
        bullet("Moderate+ consequence events have increased \u2014 investigate whether controls are being bypassed.")
    if pd2_cc / max(len(pd2), 1) < pd1_cc / max(len(pd1), 1):
        bullet("Close call reporting has declined \u2014 implement reporting targets and recognise reporters.")
    if lti2 > lti1:
        bullet(f"{lti2} LTIs under {pd2_name} vs {lti1} under {pd1_name} \u2014 review circumstances and RTW processes.")

    h2("9.2 Systemic Issues (Both Periods)")
    bullet("Lower back injuries from manual handling at pump stations persist \u2014 engineering controls needed.")
    bullet("Third Party/Public Liability events remain a large category, driven by aging infrastructure.")
    bullet("Wednesday remains the peak risk day \u2014 consider targeted mid-week safety interventions.")

    h2("9.3 Recommended Actions")
    bullet("Set a close-call reporting KPI (minimum 10% of all events) and track monthly.")
    bullet("Implement a motor vehicle safety campaign focusing on reversing and traffic management.")
    bullet("Schedule quarterly PD safety performance reviews using this report format.")

    # ── Save ──
    output_path = os.path.join(output_dir, "SHEQ_PD_Comparison.docx")
    doc.save(output_path)
    return output_path


# ═══════════════════════════════════════════════
# MAIN ENTRY POINT
# ═══════════════════════════════════════════════

def run_analysis(filepath, start_date, split_date, pd1_name, pd2_name, output_dir="output"):
    """Run the full analysis pipeline."""
    os.makedirs(output_dir, exist_ok=True)

    print(f"Loading data from {filepath}...")
    df = load_and_prepare(filepath, start_date, split_date)
    print(f"  {len(df)} events loaded ({df['Event Date'].min().date()} to {df['Event Date'].max().date()})")
    print(f"  {pd1_name}: {(df['PD']=='pd1').sum()} events")
    print(f"  {pd2_name}: {(df['PD']=='pd2').sum()} events")

    print("Generating charts...")
    charts = generate_charts(df, pd1_name, pd2_name, split_date, output_dir)
    print(f"  {len(charts)} charts created")

    print("Generating DOCX report...")
    docx_path = generate_docx(df, pd1_name, pd2_name, split_date, charts, output_dir)
    print(f"  Report saved to {docx_path}")

    return docx_path


if __name__ == "__main__":
    run_analysis(
        filepath="All_Events__5_.xlsx",
        start_date="2024-01-01",
        split_date="2025-04-01",
        pd1_name="Matthew Arthur",
        pd2_name="Manga",
        output_dir="output"
    )