cleaning reader_pdf
This commit is contained in:
parent
3a35401c1a
commit
bd1f6263ca
|
|
@ -77,10 +77,6 @@ def merge_multiline_header(header_rows):
|
||||||
val = str(val).replace('\n', ' ').strip()
|
val = str(val).replace('\n', ' ').strip()
|
||||||
final_header.append(val)
|
final_header.append(val)
|
||||||
final_header = [v for v in final_header if v not in ['', None]]
|
final_header = [v for v in final_header if v not in ['', None]]
|
||||||
|
|
||||||
# header_string = ' | '.join(final_header)
|
|
||||||
# return header_string
|
|
||||||
|
|
||||||
return final_header
|
return final_header
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -177,22 +173,12 @@ def read_pdf(path: str):
|
||||||
for i, raw_body in enumerate(body_only):
|
for i, raw_body in enumerate(body_only):
|
||||||
con_body = [[cell for cell in row if cell not in (None, '')] for row in raw_body]
|
con_body = [[cell for cell in row if cell not in (None, '')] for row in raw_body]
|
||||||
cleaned = cleaning_column(clean_header[i], [con_body])
|
cleaned = cleaning_column(clean_header[i], [con_body])
|
||||||
# clean_body.append(con_body)
|
|
||||||
clean_body.append(cleaned[0])
|
clean_body.append(cleaned[0])
|
||||||
|
|
||||||
# print(clean_header)
|
|
||||||
# print(clean_body)
|
|
||||||
|
|
||||||
parsed = []
|
parsed = []
|
||||||
# for cols, rows in zip(clean_header, clean_body):
|
|
||||||
# parsed.append({
|
|
||||||
# "title": "",
|
|
||||||
# "columns": cols,
|
|
||||||
# "rows": rows
|
|
||||||
# })
|
|
||||||
for i, (cols, rows) in enumerate(zip(clean_header, clean_body), start=1):
|
for i, (cols, rows) in enumerate(zip(clean_header, clean_body), start=1):
|
||||||
parsed.append({
|
parsed.append({
|
||||||
"title": str(i), # bisa juga f"Table {i}" kalau mau format tertentu
|
"title": str(i),
|
||||||
"columns": cols,
|
"columns": cols,
|
||||||
"rows": rows
|
"rows": rows
|
||||||
})
|
})
|
||||||
|
|
@ -206,28 +192,20 @@ def read_pdf(path: str):
|
||||||
|
|
||||||
|
|
||||||
def convert_df(payload):
|
def convert_df(payload):
|
||||||
# Validasi dasar
|
|
||||||
print(f'payload {payload}')
|
|
||||||
|
|
||||||
# Cek apakah keys ada
|
|
||||||
if "columns" not in payload or "rows" not in payload:
|
if "columns" not in payload or "rows" not in payload:
|
||||||
raise ValueError("Payload tidak memiliki key 'columns' atau 'rows'.")
|
raise ValueError("Payload tidak memiliki key 'columns' atau 'rows'.")
|
||||||
|
|
||||||
# Pastikan columns dan rows berupa list
|
|
||||||
if not isinstance(payload["columns"], list):
|
if not isinstance(payload["columns"], list):
|
||||||
raise TypeError("'columns' harus berupa list.")
|
raise TypeError("'columns' harus berupa list.")
|
||||||
if not isinstance(payload["rows"], list):
|
if not isinstance(payload["rows"], list):
|
||||||
raise TypeError("'rows' harus berupa list.")
|
raise TypeError("'rows' harus berupa list.")
|
||||||
|
|
||||||
# Pastikan setiap baris punya jumlah kolom yang sama
|
|
||||||
for i, row in enumerate(payload["rows"]):
|
for i, row in enumerate(payload["rows"]):
|
||||||
if len(row) != len(payload["columns"]):
|
if len(row) != len(payload["columns"]):
|
||||||
raise ValueError(f"Jumlah elemen di baris ke-{i} tidak sesuai jumlah kolom.")
|
raise ValueError(f"Jumlah elemen di baris ke-{i} tidak sesuai jumlah kolom.")
|
||||||
|
|
||||||
# Konversi menjadi DataFrame
|
|
||||||
df = pd.DataFrame(payload["rows"], columns=payload["columns"])
|
df = pd.DataFrame(payload["rows"], columns=payload["columns"])
|
||||||
|
|
||||||
# Tambahkan atribut title kalau ada
|
|
||||||
if "title" in payload:
|
if "title" in payload:
|
||||||
df.attrs["title"] = payload["title"]
|
df.attrs["title"] = payload["title"]
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user