amex_parser.py

"""
AMEX NZ PDF Statement Parser

Handles the Airpoints Platinum Card statement format where transactions
appear as: DD.MM.YY  MERCHANT NAME  [LOCATION]  AMOUNT [CR]
"""

import re
import pdfplumber
from dataclasses import dataclass, field
from typing import Optional

DATE_RE = re.compile(r'^(\d{2}\.\d{2}\.\d{2})\s+(.*)')
AMOUNT_RE = re.compile(r'^([\d,]+\.\d{2})(CR)?$')
AMOUNT_INLINE_RE = re.compile(r'^(.*?)\s+([\d,]+\.\d{2})\s*(CR)?$')

SKIP_PATTERNS = [
    r'^MATTHEW BRUCE COHEN',
    r'^XXXX-XXXXXX-\d+',
    r'^Page \d+ / \d+',
    r'^Details\s+Foreign Spending',
    r'^Amount \$',
    r'^Prepared for',
    r'^Membership Number',
    r'^Opening Date',
    r'^Closing Date',
    r'^Airpoints Platinum Card',
    r'^Statement',
    r'^americanexpress',
    r'^Please check all',
    r'^Total of New Transactions',
    r'^Opening Balance',
    r'^Credit Summary',
    r'^Current Rate of Interest',
    r'^Statement Period',
    r'^Annual Rate',
    r'^Credit Limit',
    r'^\d+\.\d{2}\s*[-+]',   # summary balance line
    r'^Minimum Payment',
    r'^Due by',
    r'^NZD \d',               # foreign currency note
    r'^\d+\.\d{2} UNITED STATES',
    r'^DOLLAR',
    r'^NZD \d+\.\d{2} includes',
    r'^\.\.\.',
    r'^If you',
    r'^Please pay',
    r'^Visit www',
    r'^balance\.',
    r'^interest\.',
    r'^American Express',
    r'^Incorporated',
    r'^PO Box',
    r'^Auckland',
    r'^New Zealand',
]

SKIP_RES = [re.compile(p, re.IGNORECASE) for p in SKIP_PATTERNS]


@dataclass
class Transaction:
    date: str           # DD.MM.YY
    description: str
    amount: float
    is_credit: bool


def _should_skip(line: str) -> bool:
    for pattern in SKIP_RES:
        if pattern.match(line):
            return True
    return False


def _parse_amount(text: str) -> tuple[Optional[float], bool]:
    """Extract amount and credit flag from a string like '1,242.55CR' or '21.45'."""
    text = text.strip()
    m = AMOUNT_RE.match(text)
    if m:
        amount = float(m.group(1).replace(',', ''))
        is_credit = bool(m.group(2))
        return amount, is_credit
    return None, False


def _extract_lines(page) -> list[str]:
    """
    Use word bounding boxes to reconstruct lines.
    Groups words by Y-position and sorts each group by X.
    Amounts (rightmost column) are appended at end of line with a tab separator.
    """
    words = page.extract_words(x_tolerance=5, y_tolerance=4)
    if not words:
        return []

    # Determine amount-column threshold (rightmost ~22% of page)
    page_width = page.width
    amount_threshold = page_width * 0.78

    # Group words by rounded Y position
    buckets: dict[int, dict] = {}
    for w in words:
        key = round(w['top'] / 4) * 4
        if key not in buckets:
            buckets[key] = {'left': [], 'right': []}
        if w['x0'] >= amount_threshold:
            buckets[key]['right'].append((w['x0'], w['text']))
        else:
            buckets[key]['left'].append((w['x0'], w['text']))

    lines = []
    for y in sorted(buckets):
        left = ' '.join(t for _, t in sorted(buckets[y]['left']))
        right = ' '.join(t for _, t in sorted(buckets[y]['right']))
        line = left
        if right:
            line = f"{left}\t{right}" if left else right
        lines.append(line.strip())

    return [l for l in lines if l]


def parse_statement(pdf_path: str) -> list[Transaction]:
    """Parse all transactions from an AMEX NZ PDF statement."""
    transactions: list[Transaction] = []
    pending: Optional[dict] = None

    def commit(tx):
        if tx and tx.get('amount') is not None:
            transactions.append(Transaction(
                date=tx['date'],
                description=tx['description'].strip(),
                amount=tx['amount'],
                is_credit=tx['is_credit'],
            ))

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for raw_line in _extract_lines(page):
                # Split into description part and amount part (tab-separated)
                if '\t' in raw_line:
                    desc_part, amount_part = raw_line.split('\t', 1)
                else:
                    desc_part, amount_part = raw_line, ''

                desc_part = desc_part.strip()
                amount_part = amount_part.strip()

                if _should_skip(desc_part) and not amount_part:
                    continue

                # Check if line starts a new transaction
                date_match = DATE_RE.match(desc_part)
                if date_match:
                    commit(pending)
                    date = date_match.group(1)
                    remainder = date_match.group(2).strip()

                    # Amount might be inline in the description part
                    inline = AMOUNT_INLINE_RE.match(remainder)
                    if inline:
                        description = inline.group(1).strip()
                        amount = float(inline.group(2).replace(',', ''))
                        is_credit = bool(inline.group(3))
                    else:
                        description = remainder
                        amount = None
                        is_credit = False

                    # Check right-column amount
                    if amount_part:
                        a, c = _parse_amount(amount_part)
                        if a is not None:
                            amount, is_credit = a, c

                    pending = {
                        'date': date,
                        'description': description,
                        'amount': amount,
                        'is_credit': is_credit,
                    }

                elif pending:
                    # Continuation line — could be amount or CR
                    if amount_part and pending['amount'] is None:
                        a, c = _parse_amount(amount_part)
                        if a is not None:
                            pending['amount'] = a
                            pending['is_credit'] = c

                    # Plain amount on its own line (no tab split)
                    stripped = desc_part.strip()
                    if not amount_part:
                        a, c = _parse_amount(stripped)
                        if a is not None and pending['amount'] is None:
                            pending['amount'] = a
                            pending['is_credit'] = c
                        elif stripped == 'CR':
                            pending['is_credit'] = True

        commit(pending)

    return transactions
Initial commit — AmexPal statement analyser 2026-03-26 08:37:01 +13:00			`"""`
			`AMEX NZ PDF Statement Parser`

			`Handles the Airpoints Platinum Card statement format where transactions`
			`appear as: DD.MM.YY MERCHANT NAME [LOCATION] AMOUNT [CR]`
			`"""`

			`import re`
			`import pdfplumber`
			`from dataclasses import dataclass, field`
			`from typing import Optional`

			`DATE_RE = re.compile(r'^(\d{2}\.\d{2}\.\d{2})\s+(.*)')`
			`AMOUNT_RE = re.compile(r'^([\d,]+\.\d{2})(CR)?$')`
			`AMOUNT_INLINE_RE = re.compile(r'^(.?)\s+([\d,]+\.\d{2})\s(CR)?$')`

			`SKIP_PATTERNS = [`
			`r'^MATTHEW BRUCE COHEN',`
			`r'^XXXX-XXXXXX-\d+',`
			`r'^Page \d+ / \d+',`
			`r'^Details\s+Foreign Spending',`
			`r'^Amount \$',`
			`r'^Prepared for',`
			`r'^Membership Number',`
			`r'^Opening Date',`
			`r'^Closing Date',`
			`r'^Airpoints Platinum Card',`
			`r'^Statement',`
			`r'^americanexpress',`
			`r'^Please check all',`
			`r'^Total of New Transactions',`
			`r'^Opening Balance',`
			`r'^Credit Summary',`
			`r'^Current Rate of Interest',`
			`r'^Statement Period',`
			`r'^Annual Rate',`
			`r'^Credit Limit',`
			`r'^\d+\.\d{2}\s*[-+]', # summary balance line`
			`r'^Minimum Payment',`
			`r'^Due by',`
			`r'^NZD \d', # foreign currency note`
			`r'^\d+\.\d{2} UNITED STATES',`
			`r'^DOLLAR',`
			`r'^NZD \d+\.\d{2} includes',`
			`r'^\.\.\.',`
			`r'^If you',`
			`r'^Please pay',`
			`r'^Visit www',`
			`r'^balance\.',`
			`r'^interest\.',`
			`r'^American Express',`
			`r'^Incorporated',`
			`r'^PO Box',`
			`r'^Auckland',`
			`r'^New Zealand',`
			`]`

			`SKIP_RES = [re.compile(p, re.IGNORECASE) for p in SKIP_PATTERNS]`


			`@dataclass`
			`class Transaction:`
			`date: str # DD.MM.YY`
			`description: str`
			`amount: float`
			`is_credit: bool`


			`def _should_skip(line: str) -> bool:`
			`for pattern in SKIP_RES:`
			`if pattern.match(line):`
			`return True`
			`return False`


			`def _parse_amount(text: str) -> tuple[Optional[float], bool]:`
			`"""Extract amount and credit flag from a string like '1,242.55CR' or '21.45'."""`
			`text = text.strip()`
			`m = AMOUNT_RE.match(text)`
			`if m:`
			`amount = float(m.group(1).replace(',', ''))`
			`is_credit = bool(m.group(2))`
			`return amount, is_credit`
			`return None, False`


			`def _extract_lines(page) -> list[str]:`
			`"""`
			`Use word bounding boxes to reconstruct lines.`
			`Groups words by Y-position and sorts each group by X.`
			`Amounts (rightmost column) are appended at end of line with a tab separator.`
			`"""`
			`words = page.extract_words(x_tolerance=5, y_tolerance=4)`
			`if not words:`
			`return []`

			`# Determine amount-column threshold (rightmost ~22% of page)`
			`page_width = page.width`
			`amount_threshold = page_width * 0.78`

			`# Group words by rounded Y position`
			`buckets: dict[int, dict] = {}`
			`for w in words:`
			`key = round(w['top'] / 4) * 4`
			`if key not in buckets:`
			`buckets[key] = {'left': [], 'right': []}`
			`if w['x0'] >= amount_threshold:`
			`buckets[key]['right'].append((w['x0'], w['text']))`
			`else:`
			`buckets[key]['left'].append((w['x0'], w['text']))`

			`lines = []`
			`for y in sorted(buckets):`
			`left = ' '.join(t for _, t in sorted(buckets[y]['left']))`
			`right = ' '.join(t for _, t in sorted(buckets[y]['right']))`
			`line = left`
			`if right:`
			`line = f"{left}\t{right}" if left else right`
			`lines.append(line.strip())`

			`return [l for l in lines if l]`


			`def parse_statement(pdf_path: str) -> list[Transaction]:`
			`"""Parse all transactions from an AMEX NZ PDF statement."""`
			`transactions: list[Transaction] = []`
			`pending: Optional[dict] = None`

			`def commit(tx):`
			`if tx and tx.get('amount') is not None:`
			`transactions.append(Transaction(`
			`date=tx['date'],`
			`description=tx['description'].strip(),`
			`amount=tx['amount'],`
			`is_credit=tx['is_credit'],`
			`))`

			`with pdfplumber.open(pdf_path) as pdf:`
			`for page in pdf.pages:`
			`for raw_line in _extract_lines(page):`
			`# Split into description part and amount part (tab-separated)`
			`if '\t' in raw_line:`
			`desc_part, amount_part = raw_line.split('\t', 1)`
			`else:`
			`desc_part, amount_part = raw_line, ''`

			`desc_part = desc_part.strip()`
			`amount_part = amount_part.strip()`

			`if _should_skip(desc_part) and not amount_part:`
			`continue`

			`# Check if line starts a new transaction`
			`date_match = DATE_RE.match(desc_part)`
			`if date_match:`
			`commit(pending)`
			`date = date_match.group(1)`
			`remainder = date_match.group(2).strip()`

			`# Amount might be inline in the description part`
			`inline = AMOUNT_INLINE_RE.match(remainder)`
			`if inline:`
			`description = inline.group(1).strip()`
			`amount = float(inline.group(2).replace(',', ''))`
			`is_credit = bool(inline.group(3))`
			`else:`
			`description = remainder`
			`amount = None`
			`is_credit = False`

			`# Check right-column amount`
			`if amount_part:`
			`a, c = _parse_amount(amount_part)`
			`if a is not None:`
			`amount, is_credit = a, c`

			`pending = {`
			`'date': date,`
			`'description': description,`
			`'amount': amount,`
			`'is_credit': is_credit,`
			`}`

			`elif pending:`
			`# Continuation line — could be amount or CR`
			`if amount_part and pending['amount'] is None:`
			`a, c = _parse_amount(amount_part)`
			`if a is not None:`
			`pending['amount'] = a`
			`pending['is_credit'] = c`

			`# Plain amount on its own line (no tab split)`
			`stripped = desc_part.strip()`
			`if not amount_part:`
			`a, c = _parse_amount(stripped)`
			`if a is not None and pending['amount'] is None:`
			`pending['amount'] = a`
			`pending['is_credit'] = c`
			`elif stripped == 'CR':`
			`pending['is_credit'] = True`

			`commit(pending)`

			`return transactions`