def parse_page_selection(selectedPage: str, total_pages: int) -> list[int]:
    """Parse a user-supplied page-selection string into a sorted list of page numbers.

    Accepts comma- and/or whitespace-separated tokens, where each token is either
    a single page number ("3") or an inclusive range ("2-5"). Reversed ranges
    ("5-2") are normalized by swapping the bounds. Malformed tokens are skipped
    silently. Results are deduplicated, sorted, and clamped to 1..total_pages.

    Args:
        selectedPage: raw selection string; empty/None selects every page.
        total_pages: number of pages in the document (upper clamp bound).

    Returns:
        Sorted list of valid 1-based page numbers.
    """
    # Empty/None selection means "all pages".
    if not selectedPage:
        return list(range(1, total_pages + 1))

    pages = set()
    parts = re.split(r'[,\s]+', selectedPage.strip())

    for part in parts:
        if '-' in part:
            try:
                start, end = map(int, part.split('-'))
            except ValueError:
                # Non-numeric or malformed range token ("a-b", "1-2-3") — skip.
                continue
            if start > end:
                # Accept reversed ranges like "5-2" instead of silently
                # producing an empty range.
                start, end = end, start
            pages.update(range(start, end + 1))
        else:
            try:
                pages.add(int(part))
            except ValueError:
                # Non-numeric single token — skip.
                continue

    # Drop out-of-bounds pages; return in ascending order.
    return [p for p in sorted(pages) if 1 <= p <= total_pages]


def read_pdf(path: str, page: str = ""):
    """Extract, clean, and structure tables from the selected pages of a PDF.

    Opens the PDF with pdfplumber, reads tables only from the pages named in
    *page* (see parse_page_selection; empty string = all pages), keeps tables
    with more than two rows, then splits each into header/body, merges
    multi-line headers, cleans the body columns, and finally filters the
    resulting column set via filter_geo_admin_column.

    Args:
        path: filesystem path to the PDF file.
        page: page-selection string such as "1,3-5"; defaults to "" (all
            pages) so existing one-argument callers keep working.

    Returns:
        Whatever filter_geo_admin_column yields from the parsed table dicts
        ({"title", "columns", "rows"}), one entry per valid table.
    """
    tables_data = []

    with pdfplumber.open(path) as pdf:
        total_pages = len(pdf.pages)
        selected_pages = parse_page_selection(page, total_pages)

        print(f"[INFO] Total halaman PDF: {total_pages}")
        print(f"[INFO] Halaman yang dipilih untuk dibaca: {selected_pages}")

        for page_num in selected_pages:
            # pdfplumber pages are 0-indexed; selection is 1-based.
            pdf_page = pdf.pages[page_num - 1]
            tables = pdf_page.find_tables()
            print(f"[INFO] Halaman {page_num}: {len(tables)} tabel terdeteksi")

            for t in tables:
                table = t.extract()
                # Keep only tables with more than 2 rows (header + data).
                if len(table) > 2:
                    tables_data.append(table)

    print(f"\nTotal tabel valid: {len(tables_data)}\n")

    # Split each raw table into header rows and body rows.
    header_only = []
    body_only = []
    for tbl in tables_data:
        head, body = detect_header_rows(tbl)
        header_only.append(head)
        body_only.append(body)

    # Collapse multi-line headers into single header rows.
    clean_header = [merge_multiline_header(h) for h in header_only]

    # Drop empty cells, then normalize each body against its header.
    clean_body = []
    for i, raw_body in enumerate(body_only):
        con_body = [[cell for cell in row if cell not in (None, '')] for row in raw_body]
        cleaned = cleaning_column(clean_header[i], [con_body])
        clean_body.append(cleaned[0])

    # Assemble the API-facing structure; titles are just sequence numbers.
    parsed = []
    for i, (cols, rows) in enumerate(zip(clean_header, clean_body), start=1):
        parsed.append({
            "title": str(i),
            "columns": cols,
            "rows": rows
        })

    clean_parsed = filter_geo_admin_column(parsed)
    print(f"parsed{clean_parsed}")
    return clean_parsed