# parser.py
# Utilities to extract text from the PDF and parse rows into records.
# Assumes two tables: first is Harbor A, second is Harbor B.

from __future__ import annotations

import re
from typing import Any, Dict, List

# Optional: If you want OCR fallback later, wire in pdf2image + pytesseract here.


def extract_text_lines(pdf_path: str) -> List[str]:
    """Extract text lines from a PDF using pdfplumber (preferred), with a light
    fallback to PyPDF2. Returns a list of raw lines."""
    text = ""
    try:
        import pdfplumber

        with pdfplumber.open(pdf_path) as pdf:
            pages_text = []
            for p in pdf.pages:
                pages_text.append(p.extract_text() or "")
            text = "\n".join(pages_text)
    except Exception:
        try:
            from PyPDF2 import PdfReader

            reader = PdfReader(pdf_path)
            pages_text = []
            for page in reader.pages:
                pages_text.append(page.extract_text() or "")
            text = "\n".join(pages_text)
        except Exception:
            text = ""

    if not text.strip():
        raise RuntimeError(
            "No text extracted. If the PDF is scanned, add an OCR fallback (pytesseract)."
        )

    # Normalize to individual, stripped lines.
    return [ln.strip() for ln in text.splitlines()]


# Lines matching these patterns are table headers / column labels, not data rows.
HEADER_PATTERNS = [
    re.compile(r"\bSchiff\b.*\bETA\b.*\bETS\b", re.IGNORECASE),
    re.compile(r"Nächster Hafen|Liegeplatz|Ladung|Lotse", re.IGNORECASE),
]

# A date such as "24.12." or "24.12.2024" marks a data row.
DATE_TOKEN = re.compile(r"\b\d{1,2}\.\d{1,2}\.(?:\d{4})?")
# Optional time suffix such as " / 14.30 Uhr" (sometimes starred).
TIME_FRAGMENT = r"(?:\s*/\s*\d{1,2}\.\d{2}\s*Uhr\s*\*?)?"
DT_TOKEN_WITH_TIME = re.compile(r"\d{1,2}\.\d{1,2}\.(?:\d{4})?" + TIME_FRAGMENT)


def cleanse_lines(lines: List[str]) -> List[str]:
    """Remove known header lines; keep data lines and blank lines."""
    out: List[str] = []
    for ln in lines:
        if not ln:
            out.append("")
            continue
        if any(p.search(ln) for p in HEADER_PATTERNS):
            continue
        out.append(ln)
    return out


def split_into_tables(lines: List[str]) -> List[List[str]]:
    """Find candidate data lines (those containing a date token) and split them
    into up to two blocks separated by at least one blank line.

    Returns a list of blocks (1 or 2)."""
    candidates = [ln for ln in lines if ln == "" or DATE_TOKEN.search(ln)]
    blocks: List[List[str]] = []
    current: List[str] = []
    seen_data = False
    for ln in candidates:
        if ln == "":
            if seen_data and current:
                blocks.append(current)
                current = []
                seen_data = False
            continue
        current.append(ln)
        seen_data = True
    if current:
        blocks.append(current)
    if len(blocks) > 2:
        # Merge any extra blocks into the second one.
        blocks = [blocks[0], sum(blocks[1:], [])]
    return blocks


def parse_line_to_record(ln: str) -> Dict[str, Any]:
    """Parse a single table line into a minimal record.

    Output fields:
    - ship: text before the first date token
    - eta_raw: first date(+optional time) token as a raw string
    - ets_raw: second date(+optional time) token as a raw string (if present)
    - notes: remainder of the line after the last extracted date token
    - raw_line: the full original line
    """
    # Ship name: everything up to the first date token.
    first = DATE_TOKEN.search(ln)
    ship = ln[: first.start()].strip() if first else ln.strip()

    # Extract up to two date(+time) tokens.
    dt_tokens = DT_TOKEN_WITH_TIME.findall(ln)
    eta_raw = dt_tokens[0].strip() if len(dt_tokens) >= 1 else None
    ets_raw = dt_tokens[1].strip() if len(dt_tokens) >= 2 else None

    # Notes: everything after the last date token we captured.
    notes = ""
    if dt_tokens:
        last_match = None
        for last_match in DT_TOKEN_WITH_TIME.finditer(ln):
            pass
        if last_match:
            notes = ln[last_match.end():].strip()

    return {
        "ship": ship,
        "eta_raw": eta_raw,
        "ets_raw": ets_raw,
        "notes": notes,
        "raw_line": ln,
    }


def parse_pdf_to_records(pdf_path: str) -> List[Dict[str, Any]]:
    """High-level pipeline: extract lines, cleanse headers, split into 1-2 tables,
    tag them as harbor A or B by order, and parse each row into a record."""
    lines = extract_text_lines(pdf_path)
    clean = cleanse_lines(lines)
    blocks = split_into_tables(clean)

    records: List[Dict[str, Any]] = []
    for i, block in enumerate(blocks):
        harbor = "A" if i == 0 else "B"
        for ln in block:
            if not ln.strip():
                continue
            rec = parse_line_to_record(ln)
            rec["harbor"] = harbor
            records.append(rec)
    return records
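

# OCR fallback sketch, as mentioned in the header comment: converts pages to images
# with pdf2image and runs pytesseract on them. It assumes Poppler and Tesseract are
# installed and that the German language pack is the right choice; it is not wired
# into extract_text_lines by default.
def extract_text_lines_ocr(pdf_path: str) -> List[str]:
    """OCR-based extraction for scanned PDFs (optional helper, not used above)."""
    from pdf2image import convert_from_path
    import pytesseract

    lines: List[str] = []
    for image in convert_from_path(pdf_path):
        text = pytesseract.image_to_string(image, lang="deu")
        lines.extend(ln.strip() for ln in text.splitlines())
    return lines


# Example usage (a minimal sketch, not part of the parsing API): running the module
# directly prints one JSON record per row. The default path "hafenliste.pdf" is a
# hypothetical placeholder; pass the actual PDF path on the command line.
if __name__ == "__main__":
    import json
    import sys

    path = sys.argv[1] if len(sys.argv) > 1 else "hafenliste.pdf"
    for record in parse_pdf_to_records(path):
        # ensure_ascii=False keeps umlauts in ship names and notes readable.
        print(json.dumps(record, ensure_ascii=False))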