From bd1f6263ca57f67d063da7bfb1abd07382bcc1ad Mon Sep 17 00:00:00 2001 From: dmsanhrProject Date: Thu, 30 Oct 2025 11:12:57 +0700 Subject: [PATCH] cleaning reader_pdf --- .DS_Store | Bin 6148 -> 6148 bytes services/reader_pdf.py | 24 +----------------------- 2 files changed, 1 insertion(+), 23 deletions(-) diff --git a/.DS_Store b/.DS_Store index 23ffbc084973f348acd719362eb32dd03072ef05..02daf350c3ae76dd0bbb5a1e8ae6e51b038e63cb 100644 GIT binary patch delta 98 zcmZoMXfc@JFU-Zjz`)4BAi%(onp9p~kd%|3v{{g2B{M%rf|VheArXi(7*Z#Tur_cQ n8X4&*n3|YRp2KP{jwD$YT$GoSpO+3YYcmfEKl5gGj=%f>9KaRz delta 48 zcmZoMXfc@JFUrBdz`)4BAi&_6lb@WFlb;0S3v3qTSkBDIvH1rJKl8+fH=Efx{_+C= D4j~M; diff --git a/services/reader_pdf.py b/services/reader_pdf.py index a5a5241..5001744 100644 --- a/services/reader_pdf.py +++ b/services/reader_pdf.py @@ -77,10 +77,6 @@ def merge_multiline_header(header_rows): val = str(val).replace('\n', ' ').strip() final_header.append(val) final_header = [v for v in final_header if v not in ['', None]] - - # header_string = ' | '.join(final_header) - # return header_string - return final_header @@ -177,22 +173,12 @@ def read_pdf(path: str): for i, raw_body in enumerate(body_only): con_body = [[cell for cell in row if cell not in (None, '')] for row in raw_body] cleaned = cleaning_column(clean_header[i], [con_body]) - # clean_body.append(con_body) clean_body.append(cleaned[0]) - # print(clean_header) - # print(clean_body) - parsed = [] - # for cols, rows in zip(clean_header, clean_body): - # parsed.append({ - # "title": "", - # "columns": cols, - # "rows": rows - # }) for i, (cols, rows) in enumerate(zip(clean_header, clean_body), start=1): parsed.append({ - "title": str(i), # bisa juga f"Table {i}" kalau mau format tertentu + "title": str(i), "columns": cols, "rows": rows }) @@ -206,28 +192,20 @@ def read_pdf(path: str): def convert_df(payload): - # Validasi dasar - print(f'payload {payload}') - - # Cek apakah keys ada if "columns" not in payload or "rows" not in payload: raise ValueError("Payload tidak memiliki key 'columns' atau 'rows'.") - # Pastikan columns dan rows berupa list if not isinstance(payload["columns"], list): raise TypeError("'columns' harus berupa list.") if not isinstance(payload["rows"], list): raise TypeError("'rows' harus berupa list.") - # Pastikan setiap baris punya jumlah kolom yang sama for i, row in enumerate(payload["rows"]): if len(row) != len(payload["columns"]): raise ValueError(f"Jumlah elemen di baris ke-{i} tidak sesuai jumlah kolom.") - # Konversi menjadi DataFrame df = pd.DataFrame(payload["rows"], columns=payload["columns"]) - # Tambahkan atribut title kalau ada if "title" in payload: df.attrs["title"] = payload["title"]