Files
amexpal/amex_analyser.py
admin c1e22da9d6 Initial commit — AmexPal statement analyser
Python/Flask backend with pdfplumber parser, Svelte 4 frontend,
Docker multi-stage build. Includes category analysis, insights,
monthly/weekly charts, subscription audit, and annualised projections.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-26 08:37:01 +13:00

353 lines
13 KiB
Python

"""
AMEX Statement Analyser — categorisation, insights, monthly/weekly breakdown.
"""
import re
from collections import defaultdict
from datetime import datetime
from amex_parser import Transaction
# ── Payment processor prefixes to strip before categorising / displaying ─────
# WINDCAVE* is a common NZ payment gateway used by many merchants
_NOISE_PREFIX_RE = re.compile(
    r'^(WINDCAVE\*|SQ \*|SP |PAYPAL \*|PAYPAL\*)',
    re.IGNORECASE,
)


def normalise(description: str) -> str:
    """Strip payment processor prefixes for cleaner display and matching."""
    without_prefix = _NOISE_PREFIX_RE.sub('', description)
    return without_prefix.strip()
# ── Category rules (checked in order — first match wins) ─────────────────────
# IMPORTANT: Fuel is before Public Transport so petrol stations don't fall into Public Transport.
# Keywords are substring-matched against the upper-cased, normalised
# description (see categorise()), so matching is case-insensitive.
# Trailing spaces on short tokens (e.g. 'BP ', 'TANK ') keep them from
# matching inside longer unrelated words.
CATEGORY_RULES: list[tuple[str, list[str]]] = [
    # Card payments — listed first so they never land in a spend category.
    ('Payments', [
        'PAYMENT - THANK YOU',
    ]),
    ('Groceries', [
        'NEW WORLD', 'COUNTDOWN', 'WOOLWORTHS', 'PAK N SAVE', 'PAKNSAVE',
        'FOUR SQUARE', 'FRESH CHOICE', 'HARRIS FARM', 'MOORE WILSON',
        'FARRO', 'NOSH', 'HUCKLEBERRY', 'COMMONSENSE',
    ]),
    # Broad generic terms ('CAFE', 'GRILL', …) as well as specific chains.
    ('Dining & Takeaway', [
        'MCDONALD', 'UBER EATS', 'ST PIERRE', 'KFC', 'UMU PIZZA',
        'DOMINO', 'WILDFLOUR', 'SUBWAY', 'BURGER KING', 'PIZZA HUT',
        'NOODLE', 'SUSHI', 'RESTAURANT', 'CAFE', 'BAKERY', 'COFFEE',
        'STARBUCKS', 'MUFFIN BREAK', 'THAI', 'INDIAN', 'CHINESE',
        'KEBAB', 'TAKEAWAY', 'PITA PIT', 'OPORTO', "WENDY'S", 'GRILL',
        'BISTRO', 'BARBEQUE', 'BBQ', 'EGGS', 'PANCAKE',
    ]),
    ('Fuel', [
        'MOBIL', 'BP ', 'Z ENERGY', 'CALTEX', 'GULL ', 'CHALLENGE ',
        'NPD ', 'WAITOMO ', 'NIGHT N DAY',
    ]),
    # 'UBER TRIP' / 'UBER*TRIP' is rideshare; distinct from 'UBER EATS' above.
    ('Public Transport', [
        'AT HOP', 'AT METRO', 'AUCKLAND TRANSPORT', 'AUCKLAND COUNCIL TR',
        'PUBLIC TRANSPORT', 'INTERCITY', 'NAKED BUS', 'MANA COACH',
        'TRANZ METRO', 'RITCHIES', 'GO BUS', 'NZ BUS',
        'SNAPPER', 'METLINK', 'EBUS', 'ORBITER',
        'UBER TRIP', 'UBER*TRIP',
    ]),
    ('Utilities', [
        'SPARK', 'SLINGSHOT', 'POWERSHOP', 'CONTACT ENERGY', 'MERCURY ',
        'GENESIS ', 'WATERCARE', 'CHORUS', 'VODAFONE', 'TWO DEGREES',
        '2DEGREES', 'SKINNY', 'TRUSTPOWER', 'ORCON',
    ]),
    ('Subscriptions', [
        'GOOGLE', 'APPLE.COM', 'EMBY', 'MYOB', 'NETFLIX', 'SPOTIFY',
        'MICROSOFT', 'ADOBE', 'DROPBOX', 'AMAZON PRIME', 'DISNEY',
        'SKYDRIVE', 'YOUTUBE', 'ICLOUD', 'GITHUB', 'CANVA', 'XERO',
        'ZOOM', 'SLACK', 'ATLASSIAN', '1PASSWORD', 'LASTPASS', 'NORDVPN',
        'PARAMOUNT', 'BINGE', 'NEON ', 'LIGHTROOM',
    ]),
    ('Health & Pharmacy', [
        'CHEMIST WAREHOUSE', 'PHARMACY', 'UNICHEM', 'LIFE PHARMACY',
        'GREEN CROSS', 'DOCTOR', 'MEDICAL', 'DENTAL', 'DENTIST',
        'OPTOMETRIST', 'PHYSIO', 'HEALTHZONE', 'HEALTH FOOD',
        'SPECSAVERS', 'VISION',
    ]),
    ('Home & Hardware', [
        'BUNNINGS', 'MITRE 10', 'PLACEMAKERS', 'WISELIVING',
        'IKEA', 'FREEDOM ', 'SPOTLIGHT', 'BED BATH', 'ADAIRS',
        'BRISCOES', 'STEVENS ', 'LIVING AND GIVING', 'KMART HOMEWARE',
        'BABY FACTORY', 'BABY CITY',
    ]),
    ('Electronics & Appliances', [
        'HARVEY NORMAN', 'NOEL LEEMING', 'JB HI-FI', 'THE GOOD GUYS',
        'PB TECH', 'COMPUTER LOUNGE', 'MIGHTY APE', 'PLAYTECH',
    ]),
    # 'KMART HOMEWARE' (Home & Hardware) is checked before plain 'KMART' here.
    ('Shopping & Apparel', [
        'KMART', 'THE WAREHOUSE', 'WAREHOUSE STATIONERY', 'FARMERS ',
        'COTTON ON', 'GLASSONS', 'HALLENSTEINS', 'KATHMANDU', 'REBEL ',
        'FOOTLOCKER', 'POSTIE', 'STIRLING SPORTS', 'LULULEMON', 'EZIBUY',
        'WHITCOULLS', 'PAPER PLUS', 'SMITHS CITY', 'HANNAHS',
        'NUMBER ONE SHOES', 'SHOE WAREHOUSE',
    ]),
    ('Entertainment', [
        'HOYTS', 'READING CINEMA', 'EVENT CINEMA', 'TICKETEK',
        'TICKETMASTER', 'EVENTFINDA', 'SKY TV', 'TIMEZONE',
        'LASER STRIKE', 'ARCHIE BROTHERS', 'BOWLING', 'PAINTBALL',
    ]),
    ('Travel & Accommodation', [
        'AIRBNB', 'BOOKING.COM', 'HOTEL', 'MOTEL', 'HOSTEL',
        'AIR NEW ZEALAND', 'JETSTAR', 'QANTAS', 'VIRGIN AUSTRALIA',
        'SCENIC HOTEL', 'HOLIDAY INN', 'NOVOTEL', 'IBIS ',
        'TRIVAGO', 'EXPEDIA',
    ]),
    # Leading spaces (' SALON', ' NAIL ') avoid matching word-internal hits.
    ('Personal Care', [
        'HAIRCUT', 'BARBER', ' SALON', 'DAY SPA', 'BEAUTY', 'LASER ',
        ' NAIL ', 'WAXING', 'MASSAGE',
    ]),
    ('Food & Specialty', [
        'SABATO', 'TANK ', 'ORIGIN COFFEE', 'ATOMIC COFFEE',
    ]),
    ('Pets', [
        'VET ', 'VETCARE', 'PETBARN', 'PET STOCK', 'ANIMATES',
        'HOLLYWOOD FISH', 'PETCO',
    ]),
]
# Emoji badge per category for the frontend. Every category that can come out
# of categorise() — plus the 'Other' fallback — has an entry.
# NOTE(review): 'Fuel' and 'Utilities' mapped to empty strings while every
# other category had an emoji — almost certainly lost in an encoding/paste
# step; restored here.
CATEGORY_ICONS = {
    'Groceries': '🛒',
    'Dining & Takeaway': '🍔',
    'Fuel': '⛽',
    'Public Transport': '🚌',
    'Utilities': '⚡',
    'Subscriptions': '📱',
    'Health & Pharmacy': '💊',
    'Home & Hardware': '🏠',
    'Electronics & Appliances': '💻',
    'Shopping & Apparel': '🛍️',
    'Entertainment': '🎬',
    'Travel & Accommodation': '✈️',
    'Personal Care': '💅',
    'Food & Specialty': '🍷',
    'Pets': '🐾',
    'Other': '📦',
    'Payments': '💳',
}
def categorise(description: str) -> str:
    """Map a raw transaction description to a spending category.

    The description is normalised (processor prefixes stripped) and
    upper-cased, then checked against CATEGORY_RULES in declaration order;
    the first rule with any matching keyword wins. Unmatched descriptions
    fall through to 'Other'.
    """
    # Normalise first so WINDCAVE*DOMINOS → DOMINOS → Dining & Takeaway
    haystack = normalise(description).upper()
    return next(
        (category
         for category, keywords in CATEGORY_RULES
         if any(kw.upper() in haystack for kw in keywords)),
        'Other',
    )
def _parse_date(date_str: str) -> datetime | None:
try:
return datetime.strptime(date_str, '%d.%m.%y')
except ValueError:
return None
def _clean_merchant(description: str) -> str:
    """Normalised merchant name for grouping (strip noise prefix + location suffix)."""
    base = re.sub(r'\s+\d{3,6}$', '', normalise(description)).strip()
    tokens = base.split()
    # Peel trailing ALL-CAPS tokens (store locations etc.), but only while
    # the remaining prefix still contains a known category keyword — i.e.
    # only while we're sure we aren't eating the brand name itself.
    while len(tokens) > 1:
        tail = tokens[-1]
        if not (tail.isupper() and len(tail) <= 15):
            break
        prefix = ' '.join(tokens[:-1]).upper()
        if not any(kw.upper() in prefix for _, kws in CATEGORY_RULES for kw in kws):
            break
        tokens = tokens[:-1]
    return ' '.join(tokens)
def _short_merchant(description: str) -> str:
    """Short, title-cased display name for insights."""
    name = normalise(description)
    # Drop any trailing URL and processor reference tokens like '*TRIP'.
    name, _, _ = name.partition('HTTPS://')
    name = re.sub(r'\*\w+', '', name.strip()).strip()
    return name.title()
def monthly_breakdown(enriched: list[dict]) -> dict:
    """Per-month spend totals broken down by category.

    Credits and transactions with unparseable dates are skipped. Returns
    chronologically-ordered month labels, categories sorted by overall
    spend (largest first), a month→category→amount matrix, and per-month
    totals — all amounts rounded to 2 dp.
    """
    per_month: dict[str, dict[str, float]] = defaultdict(lambda: defaultdict(float))
    sort_keys: dict[str, str] = {}
    for tx in enriched:
        if tx['is_credit']:
            continue
        when = _parse_date(tx['date'])
        if when is None:
            continue
        label = when.strftime('%b %Y')
        sort_keys[label] = when.strftime('%Y-%m')
        per_month[label][tx['category']] += tx['amount']
    months = sorted(per_month, key=sort_keys.__getitem__)
    categories = sorted(
        {cat for bucket in per_month.values() for cat in bucket},
        key=lambda cat: -sum(per_month[m].get(cat, 0) for m in months),
    )
    return {
        'months': months,
        'categories': categories,
        'by_month': {
            m: {cat: round(per_month[m].get(cat, 0), 2) for cat in categories}
            for m in months
        },
        'totals': {m: round(sum(per_month[m].values()), 2) for m in months},
    }
def weekly_breakdown(enriched: list[dict]) -> dict:
    """Spend totals bucketed by week-of-month (days 1-7 → Week 1, 8-14 → Week 2, …).

    Credits and unparseable dates are skipped; totals rounded to 2 dp.
    """
    buckets: dict[str, float] = defaultdict(float)
    for tx in enriched:
        if tx['is_credit']:
            continue
        when = _parse_date(tx['date'])
        if when is None:
            continue
        label = 'Week %d' % ((when.day - 1) // 7 + 1)
        buckets[label] += tx['amount']
    ordered = sorted(buckets)
    return {'weeks': ordered, 'totals': {w: round(buckets[w], 2) for w in ordered}}
def _unique_merchants(txns: list[dict], limit: int) -> str:
    """Comma-joined unique merchant display names, first-seen order, capped at *limit*."""
    names = list(dict.fromkeys(_short_merchant(t['description']) for t in txns))
    return ', '.join(names[:limit])


def generate_insights(enriched: list[dict], by_category: dict, total_spend: float) -> list[dict]:
    """Build the insight cards shown on the dashboard.

    Each card is a dict with 'icon', 'title', 'stat', 'detail' and 'color'.
    Only non-credit transactions are considered; cards whose category has no
    transactions are omitted, so empty input yields an empty list.

    Args:
        enriched: transaction dicts with 'date', 'description', 'amount',
            'is_credit' and 'category' keys.
        by_category: per-category spend totals (as produced by analyse()).
        total_spend: overall debit total, used for percentage / daily stats.
    """
    spend_txns = [t for t in enriched if not t['is_credit']]
    insights = []
    # Grocery trips
    grocery_txns = [t for t in spend_txns if t['category'] == 'Groceries']
    if grocery_txns:
        total = by_category.get('Groceries', 0)
        count = len(grocery_txns)
        avg = total / count if count else 0
        pct = (total / total_spend * 100) if total_spend else 0
        insights.append({
            'icon': '🛒', 'title': 'Grocery Shopping',
            'stat': f'${total:.2f}',
            'detail': f'{count} trips · avg ${avg:.0f} each · {pct:.0f}% of spend',
            'color': '#27ae60',
        })
    # Dining & takeaway
    dining_txns = [t for t in spend_txns if t['category'] == 'Dining & Takeaway']
    if dining_txns:
        total = by_category.get('Dining & Takeaway', 0)
        count = len(dining_txns)
        insights.append({
            'icon': '🍔', 'title': 'Dining & Takeaway',
            'stat': f'${total:.2f}',
            'detail': f'{count} orders this period',
            'color': '#e67e22',
        })
    # Subscriptions
    sub_txns = [t for t in spend_txns if t['category'] == 'Subscriptions']
    if sub_txns:
        total = by_category.get('Subscriptions', 0)
        insights.append({
            'icon': '📱', 'title': 'Subscriptions',
            'stat': f'${total:.2f}',
            'detail': _unique_merchants(sub_txns, 5),
            'color': '#8e44ad',
        })
    # Utilities — icon was '' (emoji evidently lost in transit); restored to
    # match CATEGORY_ICONS' convention of one emoji per card.
    util_txns = [t for t in spend_txns if t['category'] == 'Utilities']
    if util_txns:
        total = by_category.get('Utilities', 0)
        insights.append({
            'icon': '⚡', 'title': 'Utilities',
            'stat': f'${total:.2f}',
            'detail': _unique_merchants(util_txns, 4),
            'color': '#f39c12',
        })
    # Biggest single purchase
    if spend_txns:
        biggest = max(spend_txns, key=lambda t: t['amount'])
        insights.append({
            'icon': '💳', 'title': 'Largest Purchase',
            'stat': f'${biggest["amount"]:.2f}',
            'detail': _short_merchant(biggest['description']),
            'color': '#e74c3c',
        })
    # Daily average — counts only days with at least one debit transaction.
    active_days = len({t['date'] for t in spend_txns})
    if active_days > 0 and total_spend > 0:
        insights.append({
            'icon': '📅', 'title': 'Daily Average',
            'stat': f'${total_spend / active_days:.2f}',
            'detail': f'Across {active_days} active spending days',
            'color': '#006fcf',
        })
    return insights
def analyse(enriched_transactions: list[dict] | None = None,
            transactions: list[Transaction] | None = None) -> dict:
    """Accept either pre-enriched dicts or raw Transaction objects.

    Returns the full analysis payload for the frontend: totals, category
    breakdown, top merchants, insight cards, monthly/weekly charts and the
    (id-stamped) transaction list itself.
    """
    if enriched_transactions is None:
        # Enrich raw parser output: normalise the description for display
        # but categorise on the raw text (categorise normalises internally).
        enriched_transactions = [
            {
                'date': tx.date,
                'description': normalise(tx.description),
                'amount': tx.amount,
                'is_credit': tx.is_credit,
                'category': categorise(tx.description),
            }
            for tx in (transactions or [])
        ]

    spend_txns = [t for t in enriched_transactions if not t['is_credit']]
    credit_txns = [t for t in enriched_transactions if t['is_credit']]
    payment_txns = [t for t in credit_txns if t['category'] == 'Payments']

    total_spend = round(sum(t['amount'] for t in spend_txns), 2)
    total_credits = round(sum(t['amount'] for t in credit_txns), 2)
    total_payments = round(sum(t['amount'] for t in payment_txns), 2)

    # Category totals, largest first.
    cat_totals: dict[str, float] = defaultdict(float)
    for t in spend_txns:
        cat_totals[t['category']] += t['amount']
    by_category = {
        cat: round(amount, 2)
        for cat, amount in sorted(cat_totals.items(), key=lambda item: -item[1])
    }

    # Aggregate spend per cleaned merchant name, then keep the top ten.
    merchant_totals: dict[str, float] = defaultdict(float)
    merchant_counts: dict[str, int] = defaultdict(int)
    for t in spend_txns:
        merchant = _clean_merchant(t['description'])
        merchant_totals[merchant] += t['amount']
        merchant_counts[merchant] += 1
    top_merchants = sorted(
        (
            {'name': m, 'total': round(merchant_totals[m], 2), 'count': merchant_counts[m]}
            for m in merchant_totals
        ),
        key=lambda entry: -entry['total'],
    )[:10]

    monthly = monthly_breakdown(enriched_transactions)
    weekly = weekly_breakdown(enriched_transactions)
    insights = generate_insights(enriched_transactions, by_category, total_spend)

    # Stable numeric IDs for frontend list keying
    for idx, tx in enumerate(enriched_transactions):
        tx.setdefault('id', idx)

    return {
        'total_spend': total_spend,
        'total_credits': total_credits,
        'total_payments': total_payments,
        'transaction_count': len(enriched_transactions),
        'spend_count': len(spend_txns),
        'by_category': by_category,
        'category_icons': CATEGORY_ICONS,
        'top_merchants': top_merchants,
        'insights': insights,
        'monthly': monthly,
        'weekly': weekly,
        'transactions': enriched_transactions,
    }