Update PDF reader to read tables from selected pages

This commit is contained in:
dmsanhrProject 2025-11-04 14:22:18 +07:00
parent fb8a7b96e7
commit 90b7351d9b
2 changed files with 68 additions and 32 deletions

View File

@ -225,7 +225,7 @@ async def server_status():
@app.post("/upload") @app.post("/upload")
async def upload_file(file: UploadFile = File(...), page: Optional[str] = Form(None)): async def upload_file(file: UploadFile = File(...), page: Optional[str] = Form("")):
fname = file.filename fname = file.filename
ext = os.path.splitext(fname)[1].lower() ext = os.path.splitext(fname)[1].lower()
contents = await file.read() contents = await file.read()
@ -246,7 +246,7 @@ async def upload_file(file: UploadFile = File(...), page: Optional[str] = Form(N
elif ext == ".xlsx": elif ext == ".xlsx":
df = read_csv(str(tmp_path)) df = read_csv(str(tmp_path))
elif ext == ".pdf": elif ext == ".pdf":
tbl = read_pdf(tmp_path) tbl = read_pdf(tmp_path, page)
if len(tbl) == 0: if len(tbl) == 0:
response = { response = {
"message": "Tidak ditemukan tabel valid", "message": "Tidak ditemukan tabel valid",

View File

@ -1,6 +1,7 @@
import pdfplumber import pdfplumber
import re import re
import pandas as pd import pandas as pd
from services.filter_column import filter_geo_admin_column
def is_number(s): def is_number(s):
if s is None: if s is None:
@ -144,47 +145,82 @@ def cleaning_column(headers, bodies):
def parse_page_selection(selectedPage: str, total_pages: int):
    """Turn a page-selection string into a sorted list of valid page numbers.

    The selection is a list of 1-based page numbers and inclusive ranges
    separated by commas and/or whitespace, for example ``"1,3-5 8"``.

    Args:
        selectedPage: Raw selection text. A falsy value (``None`` or ``""``)
            selects every page.
        total_pages: Number of pages in the document.

    Returns:
        Ascending, de-duplicated page numbers that fall within
        ``1..total_pages``. Tokens that cannot be parsed as an integer or as
        ``start-end`` are skipped.
    """
    if not selectedPage:
        return list(range(1, total_pages + 1))

    pages = set()
    for part in re.split(r'[,\s]+', selectedPage.strip()):
        if '-' in part:
            try:
                start, end = map(int, part.split('-'))
            except ValueError:
                # Not exactly two integers around the dash: ignore the token.
                continue
            # Clamp to the document bounds BEFORE expanding the range, so a
            # request such as "1-100000000000" cannot allocate a giant set.
            pages.update(range(max(start, 1), min(end, total_pages) + 1))
        else:
            try:
                number = int(part)
            except ValueError:
                continue
            if 1 <= number <= total_pages:
                pages.add(number)

    return sorted(pages)
def read_pdf(path: str, page: str = ""):
    """Extract tables from the selected pages of a PDF and return them parsed.

    Args:
        path: Location of the PDF file, passed to ``pdfplumber.open``.
        page: Page-selection text handed to ``parse_page_selection``
            (e.g. ``"1,3-5"``). Defaults to ``""`` so that one-argument
            calls keep working; an empty selection reads every page.

    Returns:
        The result of ``filter_geo_admin_column`` applied to a list of
        ``{"title", "columns", "rows"}`` dicts, one per table kept, titled
        ``"1"``, ``"2"``, ... in extraction order.
    """
    tables_data = []
    with pdfplumber.open(path) as pdf:
        total_pages = len(pdf.pages)
        selected_pages = parse_page_selection(page, total_pages)

        print(f"[INFO] Total halaman PDF: {total_pages}")
        print(f"[INFO] Halaman yang dipilih untuk dibaca: {selected_pages}")

        for page_num in selected_pages:
            # Selected page numbers are 1-based; pdf.pages is indexed from 0.
            pdf_page = pdf.pages[page_num - 1]
            tables = pdf_page.find_tables()
            print(f"[INFO] Halaman {page_num}: {len(tables)} tabel terdeteksi")

            for found in tables:
                table = found.extract()
                # Only tables with more than two rows are kept.
                if len(table) > 2:
                    tables_data.append(table)

    print(f"\nTotal tabel valid: {len(tables_data)}\n")

    # Each table is unpacked into a (head, body) pair by detect_header_rows.
    split_tables = [detect_header_rows(tbl) for tbl in tables_data]
    clean_header = [merge_multiline_header(head) for head, _ in split_tables]

    clean_body = []
    for header, (_, raw_body) in zip(clean_header, split_tables):
        # Drop empty cells (None or '') from every row before column cleaning.
        con_body = [[cell for cell in row if cell not in (None, '')] for row in raw_body]
        cleaned = cleaning_column(header, [con_body])
        clean_body.append(cleaned[0])

    parsed = [
        {
            "title": str(i),
            "columns": cols,
            "rows": rows,
        }
        for i, (cols, rows) in enumerate(zip(clean_header, clean_body), start=1)
    ]

    clean_parsed = filter_geo_admin_column(parsed)
    print(f"parsed{clean_parsed}")
    return clean_parsed