update reader pdf by page
This commit is contained in:
parent
fb8a7b96e7
commit
90b7351d9b
4
main.py
4
main.py
|
|
@ -225,7 +225,7 @@ async def server_status():
|
||||||
|
|
||||||
|
|
||||||
@app.post("/upload")
|
@app.post("/upload")
|
||||||
async def upload_file(file: UploadFile = File(...), page: Optional[str] = Form(None)):
|
async def upload_file(file: UploadFile = File(...), page: Optional[str] = Form("")):
|
||||||
fname = file.filename
|
fname = file.filename
|
||||||
ext = os.path.splitext(fname)[1].lower()
|
ext = os.path.splitext(fname)[1].lower()
|
||||||
contents = await file.read()
|
contents = await file.read()
|
||||||
|
|
@ -246,7 +246,7 @@ async def upload_file(file: UploadFile = File(...), page: Optional[str] = Form(N
|
||||||
elif ext == ".xlsx":
|
elif ext == ".xlsx":
|
||||||
df = read_csv(str(tmp_path))
|
df = read_csv(str(tmp_path))
|
||||||
elif ext == ".pdf":
|
elif ext == ".pdf":
|
||||||
tbl = read_pdf(tmp_path)
|
tbl = read_pdf(tmp_path, page)
|
||||||
if len(tbl) == 0:
|
if len(tbl) == 0:
|
||||||
response = {
|
response = {
|
||||||
"message": "Tidak ditemukan tabel valid",
|
"message": "Tidak ditemukan tabel valid",
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
import pdfplumber
|
import pdfplumber
|
||||||
import re
|
import re
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
from services.filter_column import filter_geo_admin_column
|
||||||
|
|
||||||
def is_number(s):
|
def is_number(s):
|
||||||
if s is None:
|
if s is None:
|
||||||
|
|
@ -144,47 +145,82 @@ def cleaning_column(headers, bodies):
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def parse_page_selection(selectedPage: str, total_pages: int):
    """Turn a user-supplied page selection into a sorted list of page numbers.

    ``selectedPage`` holds 1-based page numbers and inclusive ``start-end``
    ranges separated by commas and/or whitespace, e.g. ``"1, 3-5 8"``.
    An empty (or ``None``) selection means "every page".

    Tokens that cannot be parsed as an integer or as exactly one
    ``start-end`` pair are skipped, and page numbers outside
    ``1..total_pages`` are dropped.

    Args:
        selectedPage: Raw selection string as entered by the user.
        total_pages: Number of pages in the document.

    Returns:
        Sorted list of unique, valid 1-based page numbers.
    """
    if not selectedPage:
        return list(range(1, total_pages + 1))

    pages = set()
    for part in re.split(r'[,\s]+', selectedPage.strip()):
        try:
            if '-' in part:
                # Unpacking raises ValueError for malformed ranges such as
                # "1-2-3" or "-3", which are skipped like any other bad token.
                start, end = map(int, part.split('-'))
                # Clamp to the document bounds BEFORE expanding the range:
                # the string is user input, and a token like "1-999999999999"
                # would otherwise build a gigantic set only to be filtered
                # away afterwards.
                pages.update(range(max(start, 1), min(end, total_pages) + 1))
            else:
                pages.add(int(part))
        except ValueError:
            continue

    # Single page tokens are not clamped above, so filter them here.
    return [p for p in sorted(pages) if 1 <= p <= total_pages]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def read_pdf(path: str, page: str):
|
||||||
pdf_path = path
|
pdf_path = path
|
||||||
|
selectedPage = page
|
||||||
tables_data = []
|
tables_data = []
|
||||||
with pdfplumber.open(pdf_path) as pdf:
|
with pdfplumber.open(pdf_path) as pdf:
|
||||||
page = pdf.pages[0]
|
total_pages = len(pdf.pages)
|
||||||
tables = page.find_tables()
|
selected_pages = parse_page_selection(selectedPage, total_pages)
|
||||||
for i, t in enumerate(tables, start=1):
|
|
||||||
table = t.extract()
|
|
||||||
if len(table) > 4:
|
|
||||||
tables_data.append(table)
|
|
||||||
|
|
||||||
print(f"\nTotal tabel valid: {len(tables_data)}\n")
|
print(f"[INFO] Total halaman PDF: {total_pages}")
|
||||||
|
print(f"[INFO] Halaman yang dipilih untuk dibaca: {selected_pages}")
|
||||||
|
|
||||||
header_only = []
|
for page_num in selected_pages:
|
||||||
body_only = []
|
pdf_page = pdf.pages[page_num - 1] # index pdfplumber mulai dari 0
|
||||||
for tbl in tables_data:
|
tables = pdf_page.find_tables()
|
||||||
head, body = detect_header_rows(tbl)
|
print(f"[INFO] Halaman {page_num}: {len(tables)} tabel terdeteksi")
|
||||||
header_only.append(head)
|
|
||||||
body_only.append(body)
|
|
||||||
|
|
||||||
clean_header = []
|
for t in tables:
|
||||||
for h in header_only:
|
table = t.extract()
|
||||||
clean_header.append(merge_multiline_header(h))
|
if len(table) > 2:
|
||||||
|
tables_data.append(table)
|
||||||
|
|
||||||
clean_body=[]
|
print(f"\nTotal tabel valid: {len(tables_data)}\n")
|
||||||
for i, raw_body in enumerate(body_only):
|
|
||||||
con_body = [[cell for cell in row if cell not in (None, '')] for row in raw_body]
|
|
||||||
cleaned = cleaning_column(clean_header[i], [con_body])
|
|
||||||
clean_body.append(cleaned[0])
|
|
||||||
|
|
||||||
parsed = []
|
header_only = []
|
||||||
for i, (cols, rows) in enumerate(zip(clean_header, clean_body), start=1):
|
body_only = []
|
||||||
parsed.append({
|
for tbl in tables_data:
|
||||||
"title": str(i),
|
head, body = detect_header_rows(tbl)
|
||||||
"columns": cols,
|
header_only.append(head)
|
||||||
"rows": rows
|
body_only.append(body)
|
||||||
})
|
|
||||||
|
|
||||||
|
clean_header = []
|
||||||
|
for h in header_only:
|
||||||
|
clean_header.append(merge_multiline_header(h))
|
||||||
|
|
||||||
return parsed
|
clean_body=[]
|
||||||
|
for i, raw_body in enumerate(body_only):
|
||||||
|
con_body = [[cell for cell in row if cell not in (None, '')] for row in raw_body]
|
||||||
|
cleaned = cleaning_column(clean_header[i], [con_body])
|
||||||
|
clean_body.append(cleaned[0])
|
||||||
|
|
||||||
|
parsed = []
|
||||||
|
for i, (cols, rows) in enumerate(zip(clean_header, clean_body), start=1):
|
||||||
|
parsed.append({
|
||||||
|
"title": str(i),
|
||||||
|
"columns": cols,
|
||||||
|
"rows": rows
|
||||||
|
})
|
||||||
|
|
||||||
|
clean_parsed = filter_geo_admin_column(parsed)
|
||||||
|
print(f"parsed{clean_parsed}")
|
||||||
|
return clean_parsed
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user