"""
app.py — SHEQ Analysis Tool — Flask web application.
Run:
python app.py
Then open http://localhost:5000
The sidebar has two sections:
1. Events Explorer — filter and chart Events data interactively.
2. Generate Report — run the full analysis across Events, Safety Energy,
and LLC Data and download a comprehensive DOCX report.
"""
from __future__ import annotations
import logging
import os
from datetime import datetime
import pandas as pd
from flask import Flask, jsonify, render_template, request, send_file
from config import (
EVENTS_FILE, LLC_FILE, SAFETY_ENERGY_FILE,
DEFAULT_PD1_NAME, DEFAULT_PD2_NAME,
DEFAULT_START_DATE, DEFAULT_SPLIT_DATE,
OUTPUT_DIR,
)
from data_loader import load_all, get_body_parts, load_and_prepare
from analysis_engine import run_full_analysis
from report_builder import build_report
from ppt_builder import build_presentation
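# Module layout: data_loader reads the source data files, analysis_engine runs
# the full analysis, and report_builder / ppt_builder render the results as the
# DOCX / PPTX downloads; this module only wires those pieces to Flask routes.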
# ── Logging ──────────────────────────────────────────────────────────────────
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(levelname)-8s %(name)s %(message)s",
datefmt="%H:%M:%S",
)
log = logging.getLogger("app")
app = Flask(__name__)
# ── Cached raw DataFrames (loaded on first request) ──────────────────────────
_CACHE: dict[str, pd.DataFrame | None] = {
"events": None,
}
def _get_events_df() -> pd.DataFrame:
"""Return the raw Events DataFrame, loading from disk on first call."""
if _CACHE["events"] is None:
log.info("Loading Events from %s", EVENTS_FILE)
df = pd.read_excel(EVENTS_FILE)
# Normalise date column — handle "Monday, 25 March 2024" and ISO formats
date_col = "EventDate" if "EventDate" in df.columns else "Event Date"
df["_date"] = df[date_col].apply(_parse_one_date)
_CACHE["events"] = df
return _CACHE["events"].copy()
def _parse_one_date(val) -> pd.Timestamp:
    """Parse a single raw date value, returning NaT for anything unparseable."""
    if pd.isna(val):
        return pd.NaT
    s = str(val).strip()
    # Strip a leading day name ("Monday, 25 March 2024" -> "25 March 2024")
    # so pandas only has to parse the date itself.
    if "," in s and len(s.split(",")[0].split()) == 1:
        s = s.split(",", 1)[1].strip()
    try:
        return pd.to_datetime(s, dayfirst=True)
    except Exception:
        return pd.NaT
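# Illustrative conversions (values are examples only):
#   "Monday, 25 March 2024" -> Timestamp("2024-03-25")
#   "2024-03-25"            -> Timestamp("2024-03-25")
#   "02/03/2024"            -> Timestamp("2024-03-02")  (dayfirst=True)
#   NaN / unparseable text  -> NaT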
# ─────────────────────────────────────────────────────────────────────────────
# Web UI
# ─────────────────────────────────────────────────────────────────────────────
@app.route("/")
def index():
df = _get_events_df()
min_date = df["_date"].min().strftime("%Y-%m-%d")
max_date = df["_date"].max().strftime("%Y-%m-%d")
# Handle both column name variants
evt_col = "EventType" if "EventType" in df.columns else "Event Type"
cons_col = "Actual Consequence"
event_types = sorted(df[evt_col].dropna().unique().tolist())
consequences = sorted(df[cons_col].dropna().unique().tolist())
return render_template(
"index.html",
min_date=min_date,
max_date=max_date,
event_types=event_types,
consequences=consequences,
total_events=len(df),
)
# ─────────────────────────────────────────────────────────────────────────────
# Events Explorer API
# ─────────────────────────────────────────────────────────────────────────────
@app.route("/api/filter", methods=["POST"])
def api_filter():
"""Return filtered summary stats as JSON for the Events Explorer."""
    params = request.get_json(silent=True) or {}
df = _get_events_df()
evt_col = "EventType" if "EventType" in df.columns else "Event Type"
cons_col = "Actual Consequence"
crp_col = "CRP Involved" if "CRP Involved" in df.columns else "CRPInvolved"
rc_col = "Root Cause Category"
inj_col = "Ventia Injury Classification"
bp_col = "Bodily Location"
# Filters
if params.get("start_date"):
df = df[df["_date"] >= pd.Timestamp(params["start_date"])]
if params.get("end_date"):
df = df[df["_date"] <= pd.Timestamp(params["end_date"])]
if params.get("event_types"):
df = df[df[evt_col].isin(params["event_types"])]
if params.get("consequences"):
df = df[df[cons_col].isin(params["consequences"])]
if len(df) == 0:
return jsonify({"error": "No events match the selected filters.", "total": 0})
# Summary stats
evt_counts = df[evt_col].value_counts().to_dict()
cons_counts = df[cons_col].value_counts().to_dict()
inj_class = (
df[inj_col].value_counts().to_dict()
if inj_col in df.columns else {}
)
dow_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
dow = (
df["_date"].dt.day_name().value_counts()
.reindex(dow_order, fill_value=0).to_dict()
)
monthly = df.groupby(df["_date"].dt.to_period("M")).size()
monthly_data = {str(k): int(v) for k, v in monthly.items()}
bp = (
get_body_parts(df[bp_col]).value_counts().head(10).to_dict()
if bp_col in df.columns else {}
)
rc = (
df[rc_col].value_counts().to_dict()
if rc_col in df.columns else {}
)
crp: dict = {}
if crp_col in df.columns:
crp = df[crp_col].value_counts().to_dict()
crp.pop("None Identified", None)
crp.pop("Under Investigation", None)
# Investigation performance — use available columns
lag_col = next((c for c in ("Days to Investigate", "Event Lag", "Days to Enter")
if c in df.columns), None)
close_col = "Days to Close" if "Days to Close" in df.columns else None
inv_med = df[lag_col].dropna().median() if lag_col else None
close_med = df[close_col].dropna().median() if close_col else None
return jsonify({
"total": len(df),
"date_range": (
f"{df['_date'].min().strftime('%d %b %Y')} "
f"\u2013 {df['_date'].max().strftime('%d %b %Y')}"
),
"event_types": evt_counts,
"consequences": cons_counts,
"injury_classification": inj_class,
"day_of_week": dow,
"monthly": monthly_data,
"body_parts": bp,
"root_causes": rc,
"crp": crp,
"median_investigate_days": round(inv_med, 1) if inv_med and pd.notna(inv_med) else None,
"median_close_days": round(close_med, 1) if close_med and pd.notna(close_med) else None,
"closed_pct": round(
(df["Status"] == "Closed").sum() / len(df) * 100, 1
) if "Status" in df.columns else None,
})
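# Illustrative request body for /api/filter (all keys optional; values are
# examples only):
#   {"start_date": "2024-01-01", "end_date": "2024-06-30",
#    "event_types": ["Injury"], "consequences": ["First Aid"]}
# The JSON response carries the keys assembled above: total, date_range,
# event_types, consequences, injury_classification, day_of_week, monthly,
# body_parts, root_causes, crp, median_investigate_days, median_close_days
# and closed_pct.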
# ─────────────────────────────────────────────────────────────────────────────
# Comprehensive Report API
# ─────────────────────────────────────────────────────────────────────────────
@app.route("/api/generate_full_report", methods=["POST"])
def api_generate_full_report():
"""
Load all three data sources, run the full analysis pipeline, build the
DOCX report, and return it as a file download.
"""
    params = request.get_json(silent=True) or {}
start_date = params.get("start_date", DEFAULT_START_DATE)
export_format = str(params.get("export_format", "docx")).lower()
if export_format not in {"docx", "pptx"}:
return jsonify({"success": False, "error": "Supported export formats are DOCX and PPTX."}), 400
events_path = params.get("events_file", EVENTS_FILE)
se_path = params.get("safety_energy_file", SAFETY_ENERGY_FILE)
llc_path = params.get("llc_file", LLC_FILE)
run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
run_dir = os.path.join(OUTPUT_DIR, run_id)
try:
log.info("Starting full report generation (run_id=%s)", run_id)
data = load_all(events_path, se_path, llc_path)
        results = run_full_analysis(
            events=data["events"],
            safety_energy=data["safety_energy"],
            llc=data["llc"],
            start_date=start_date,
            split_date=DEFAULT_SPLIT_DATE,
            pd1_name=DEFAULT_PD1_NAME,
            pd2_name=DEFAULT_PD2_NAME,
            output_dir=run_dir,
        )
if export_format == "pptx":
report_path = build_presentation(results, run_dir)
download_name = f"SHEQ_Safety_Performance_{run_id}.pptx"
mimetype = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
else:
report_path = build_report(results, run_dir)
download_name = f"SHEQ_Safety_Performance_{run_id}.docx"
mimetype = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
log.info("Report ready: %s", report_path)
return send_file(
report_path,
as_attachment=True,
download_name=download_name,
mimetype=mimetype,
)
except FileNotFoundError as e:
log.error("File not found: %s", e)
return jsonify({"success": False, "error": str(e)}), 404
except Exception as e:
log.exception("Report generation failed")
return jsonify({"success": False, "error": str(e)}), 500
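# Illustrative client call (values are examples only; any HTTP client works):
#   curl -X POST http://localhost:5000/api/generate_full_report \
#        -H "Content-Type: application/json" \
#        -d '{"export_format": "pptx", "start_date": "2024-01-01"}' \
#        -o SHEQ_Safety_Performance.pptx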
# ── Legacy endpoint (kept for backwards compatibility) ────────────────────────
@app.route("/api/download_report", methods=["POST"])
def api_download_report():
"""Legacy Events-only PD comparison report (preserved from v1)."""
    params = request.get_json(silent=True) or {}
start_date = params.get("start_date", DEFAULT_START_DATE)
split_date = params.get("split_date", DEFAULT_SPLIT_DATE)
pd1_name = params.get("pd1_name", DEFAULT_PD1_NAME)
pd2_name = params.get("pd2_name", DEFAULT_PD2_NAME)
run_dir = os.path.join(OUTPUT_DIR, datetime.now().strftime("%Y%m%d_%H%M%S"))
try:
from analysis import run_analysis
docx_path = run_analysis(
EVENTS_FILE, start_date, split_date, pd1_name, pd2_name, run_dir
)
return send_file(
docx_path, as_attachment=True,
download_name="SHEQ_PD_Comparison.docx",
)
except Exception as e:
log.exception("Legacy report generation failed")
return jsonify({"success": False, "error": str(e)}), 500
# ─────────────────────────────────────────────────────────────────────────────
# Entry point
# ─────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
os.makedirs(OUTPUT_DIR, exist_ok=True)
log.info("SHEQ Analysis Tool starting on http://localhost:5000")
log.info(" Events: %s", EVENTS_FILE)
log.info(" Safety Energy: %s", SAFETY_ENERGY_FILE)
log.info(" LLC Data: %s", LLC_FILE)
app.run(debug=True, port=5000)