cleaning reader_pdf

This commit is contained in:
dmsanhrProject 2025-10-30 11:12:57 +07:00
parent 3a35401c1a
commit bd1f6263ca
2 changed files with 1 addition and 23 deletions

BIN
.DS_Store vendored

Binary file not shown.

View File

@@ -77,10 +77,6 @@ def merge_multiline_header(header_rows):
val = str(val).replace('\n', ' ').strip()
final_header.append(val)
final_header = [v for v in final_header if v not in ['', None]]
# header_string = ' | '.join(final_header)
# return header_string
return final_header
@@ -177,22 +173,12 @@ def read_pdf(path: str):
for i, raw_body in enumerate(body_only):
con_body = [[cell for cell in row if cell not in (None, '')] for row in raw_body]
cleaned = cleaning_column(clean_header[i], [con_body])
# clean_body.append(con_body)
clean_body.append(cleaned[0])
# print(clean_header)
# print(clean_body)
parsed = []
# for cols, rows in zip(clean_header, clean_body):
# parsed.append({
# "title": "",
# "columns": cols,
# "rows": rows
# })
for i, (cols, rows) in enumerate(zip(clean_header, clean_body), start=1):
parsed.append({
"title": str(i), # bisa juga f"Table {i}" kalau mau format tertentu
"title": str(i),
"columns": cols,
"rows": rows
})
@@ -206,28 +192,20 @@ def read_pdf(path: str):
def convert_df(payload):
# Validasi dasar
print(f'payload {payload}')
# Cek apakah keys ada
if "columns" not in payload or "rows" not in payload:
raise ValueError("Payload tidak memiliki key 'columns' atau 'rows'.")
# Pastikan columns dan rows berupa list
if not isinstance(payload["columns"], list):
raise TypeError("'columns' harus berupa list.")
if not isinstance(payload["rows"], list):
raise TypeError("'rows' harus berupa list.")
# Pastikan setiap baris punya jumlah kolom yang sama
for i, row in enumerate(payload["rows"]):
if len(row) != len(payload["columns"]):
raise ValueError(f"Jumlah elemen di baris ke-{i} tidak sesuai jumlah kolom.")
# Konversi menjadi DataFrame
df = pd.DataFrame(payload["rows"], columns=payload["columns"])
# Tambahkan atribut title kalau ada
if "title" in payload:
df.attrs["title"] = payload["title"]