"""
AMEX NZ PDF Statement Parser

Handles the Airpoints Platinum Card statement format where transactions
appear as: DD.MM.YY  MERCHANT NAME  [LOCATION]  AMOUNT [CR]
"""

import re
import pdfplumber
from dataclasses import dataclass, field
from typing import Optional

DATE_RE = re.compile(r'^(\d{2}\.\d{2}\.\d{2})\s+(.*)')
AMOUNT_RE = re.compile(r'^([\d,]+\.\d{2})(CR)?$')
AMOUNT_INLINE_RE = re.compile(r'^(.*?)\s+([\d,]+\.\d{2})\s*(CR)?$')

SKIP_PATTERNS = [
    r'^MATTHEW BRUCE COHEN',
    r'^XXXX-XXXXXX-\d+',
    r'^Page \d+ / \d+',
    r'^Details\s+Foreign Spending',
    r'^Amount \$',
    r'^Prepared for',
    r'^Membership Number',
    r'^Opening Date',
    r'^Closing Date',
    r'^Airpoints Platinum Card',
    r'^Statement',
    r'^americanexpress',
    r'^Please check all',
    r'^Total of New Transactions',
    r'^Opening Balance',
    r'^Credit Summary',
    r'^Current Rate of Interest',
    r'^Statement Period',
    r'^Annual Rate',
    r'^Credit Limit',
    r'^\d+\.\d{2}\s*[-+]',   # summary balance line
    r'^Minimum Payment',
    r'^Due by',
    r'^NZD \d',               # foreign currency note
    r'^\d+\.\d{2} UNITED STATES',
    r'^DOLLAR',
    r'^NZD \d+\.\d{2} includes',
    r'^\.\.\.',
    r'^If you',
    r'^Please pay',
    r'^Visit www',
    r'^balance\.',
    r'^interest\.',
    r'^American Express',
    r'^Incorporated',
    r'^PO Box',
    r'^Auckland',
    r'^New Zealand',
]

SKIP_RES = [re.compile(p, re.IGNORECASE) for p in SKIP_PATTERNS]


@dataclass
class Transaction:
    date: str           # DD.MM.YY
    description: str
    amount: float
    is_credit: bool


def _should_skip(line: str) -> bool:
    for pattern in SKIP_RES:
        if pattern.match(line):
            return True
    return False


def _parse_amount(text: str) -> tuple[Optional[float], bool]:
    """Extract amount and credit flag from a string like '1,242.55CR' or '21.45'."""
    text = text.strip()
    m = AMOUNT_RE.match(text)
    if m:
        amount = float(m.group(1).replace(',', ''))
        is_credit = bool(m.group(2))
        return amount, is_credit
    return None, False


def _extract_lines(page) -> list[str]:
    """
    Use word bounding boxes to reconstruct lines.
    Groups words by Y-position and sorts each group by X.
    Amounts (rightmost column) are appended at end of line with a tab separator.
    """
    words = page.extract_words(x_tolerance=5, y_tolerance=4)
    if not words:
        return []

    # Determine amount-column threshold (rightmost ~22% of page)
    page_width = page.width
    amount_threshold = page_width * 0.78

    # Group words by rounded Y position
    buckets: dict[int, dict] = {}
    for w in words:
        key = round(w['top'] / 4) * 4
        if key not in buckets:
            buckets[key] = {'left': [], 'right': []}
        if w['x0'] >= amount_threshold:
            buckets[key]['right'].append((w['x0'], w['text']))
        else:
            buckets[key]['left'].append((w['x0'], w['text']))

    lines = []
    for y in sorted(buckets):
        left = ' '.join(t for _, t in sorted(buckets[y]['left']))
        right = ' '.join(t for _, t in sorted(buckets[y]['right']))
        line = left
        if right:
            line = f"{left}\t{right}" if left else right
        lines.append(line.strip())

    return [l for l in lines if l]


def parse_statement(pdf_path: str) -> list[Transaction]:
    """Parse all transactions from an AMEX NZ PDF statement."""
    transactions: list[Transaction] = []
    pending: Optional[dict] = None

    def commit(tx):
        if tx and tx.get('amount') is not None:
            transactions.append(Transaction(
                date=tx['date'],
                description=tx['description'].strip(),
                amount=tx['amount'],
                is_credit=tx['is_credit'],
            ))

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for raw_line in _extract_lines(page):
                # Split into description part and amount part (tab-separated)
                if '\t' in raw_line:
                    desc_part, amount_part = raw_line.split('\t', 1)
                else:
                    desc_part, amount_part = raw_line, ''

                desc_part = desc_part.strip()
                amount_part = amount_part.strip()

                if _should_skip(desc_part) and not amount_part:
                    continue

                # Check if line starts a new transaction
                date_match = DATE_RE.match(desc_part)
                if date_match:
                    commit(pending)
                    date = date_match.group(1)
                    remainder = date_match.group(2).strip()

                    # Amount might be inline in the description part
                    inline = AMOUNT_INLINE_RE.match(remainder)
                    if inline:
                        description = inline.group(1).strip()
                        amount = float(inline.group(2).replace(',', ''))
                        is_credit = bool(inline.group(3))
                    else:
                        description = remainder
                        amount = None
                        is_credit = False

                    # Check right-column amount
                    if amount_part:
                        a, c = _parse_amount(amount_part)
                        if a is not None:
                            amount, is_credit = a, c

                    pending = {
                        'date': date,
                        'description': description,
                        'amount': amount,
                        'is_credit': is_credit,
                    }

                elif pending:
                    # Continuation line — could be amount or CR
                    if amount_part and pending['amount'] is None:
                        a, c = _parse_amount(amount_part)
                        if a is not None:
                            pending['amount'] = a
                            pending['is_credit'] = c

                    # Plain amount on its own line (no tab split)
                    stripped = desc_part.strip()
                    if not amount_part:
                        a, c = _parse_amount(stripped)
                        if a is not None and pending['amount'] is None:
                            pending['amount'] = a
                            pending['is_credit'] = c
                        elif stripped == 'CR':
                            pending['is_credit'] = True

        commit(pending)

    return transactions