file_table_reader/services/reader_pdf.py

264 lines
13 KiB
Python
Raw Normal View History

2025-10-29 10:07:48 +00:00
import pdfplumber
import re
import pandas as pd
2025-11-04 07:22:18 +00:00
from services.filter_column import filter_geo_admin_column
2025-10-29 10:07:48 +00:00
def is_number(s):
    """Return True when *s* reads as a plain number once separators are removed.

    The value is converted to text, surrounding whitespace is trimmed, and
    every comma and period is dropped; whatever remains must consist solely
    of digits. ``None`` is never considered a number.
    """
    if s is None:
        return False
    text = str(s).strip()
    for separator in (',', '.'):
        text = text.replace(separator, '')
    return text.isdigit()
def row_ratio(row):
    """Return the fraction of non-empty cells in *row* that are numeric.

    Cells equal to ``None``, ``''`` or ``' '`` are ignored. A row without any
    remaining cell yields ``0``.
    """
    filled = [cell for cell in row if cell not in (None, '', ' ')]
    if len(filled) == 0:
        return 0
    numeric = [cell for cell in filled if is_number(cell)]
    return len(numeric) / len(filled)
def has_mixed_text_and_numbers(row):
    """Return True when *row* holds both a textual cell and a numeric cell.

    A cell counts as text when it is a string containing at least one ASCII
    letter, and as a number when ``is_number`` accepts it. Cells equal to
    ``None``, ``''`` or ``' '`` are ignored.
    """
    contains_text = False
    contains_number = False
    for cell in row:
        if cell in (None, '', ' '):
            continue
        if isinstance(cell, str) and re.search(r'[A-Za-z]', str(cell)):
            contains_text = True
        if is_number(cell):
            contains_number = True
    return contains_text and contains_number
def is_short_text_row(row):
    """Detect a short text-only row (one or two short, non-numeric cells).

    After dropping cells equal to ``None``, ``''`` or ``' '``, the row
    qualifies when at most two cells remain, none of them is numeric, and the
    cells joined with single spaces are shorter than 20 characters.
    """
    cells = [str(cell).strip() for cell in row if cell not in (None, '', ' ')]
    if not cells or len(cells) > 2:
        return False
    if len(" ".join(cells)) >= 20:
        return False
    return not any(is_number(cell) for cell in cells)
def detect_header_rows(rows):
    """Split an extracted table into its header rows and its body rows.

    Row 0 is always treated as a potential header. The body starts at the
    first later row that looks like data, i.e. a row that

    * mixes text and numbers (``has_mixed_text_and_numbers``), or
    * has a numeric-cell ratio above 0.3 (``row_ratio``), or
    * contains a string cell made only of digits, or
    * has at least one numeric cell while the previous row had none.

    If no such row exists, every row is a potential header and the body is
    empty. Short text-only rows (``is_short_text_row``) are dropped from the
    potential headers unless the next potential header row contains no
    numeric cell.

    Args:
        rows: The table as a sequence of rows, each a sequence of cells.

    Returns:
        A ``(header_rows, body_rows)`` tuple. Empty input yields ``([], [])``.
    """
    if not rows:
        # Keep the return shape uniform so callers can always unpack two
        # values; a bare [] here made `head, body = detect_header_rows(...)`
        # raise ValueError.
        return [], []

    ratios = [row_ratio(r) for r in rows]

    # Default: no data-like row found, so the whole table is header material.
    body_start_index = len(rows)
    for i in range(1, len(rows)):
        row = rows[i]
        looks_like_data = (
            has_mixed_text_and_numbers(row)
            or ratios[i] > 0.3
            or any(isinstance(c, str) and re.match(r'^\d+$', c.strip()) for c in row)
            or (ratios[i - 1] == 0 and ratios[i] > 0)
        )
        if looks_like_data:
            body_start_index = i
            break

    potential_headers = rows[:body_start_index]
    body_filtered = rows[body_start_index:]

    header_filtered = []
    for idx, row in enumerate(potential_headers):
        if is_short_text_row(row):
            # A short text row is only kept when another potential header row
            # without numeric cells follows it.
            followed_by_text_row = (
                idx + 1 < len(potential_headers) and ratios[idx + 1] == 0
            )
            if not followed_by_text_row:
                continue
        header_filtered.append(row)
    return header_filtered, body_filtered
def merge_multiline_header(header_rows):
    """Collapse several header rows into a single list of column labels.

    For each column position the bottom-most cell that is truthy and not
    blank is used. Newlines inside the chosen cell become spaces and the
    label is trimmed. Columns that end up without a label are left out of
    the result. Columns are formed with ``zip``, so only positions present
    in every header row are considered.
    """
    merged = []
    for column_cells in zip(*header_rows):
        label = ''
        for cell in reversed(column_cells):
            if cell and str(cell).strip():
                label = cell
                break
        label = str(label).replace('\n', ' ').strip()
        if label != '':
            merged.append(label)
    return merged
# Header labels that identify a running-number ("No" / "Nomor") column.
# Order and exact spelling (including trailing spaces) are significant to
# callers that match against these strings.
NUMBER_HEADER_KEYWORDS = [
    # lower-case Indonesian variants
    "no", "no.", "no .", "no . ", "no :", "no : ",
    "nomor", "nomor.", "nomor :", "nomor urut",
    "no urut", "no. urut", "no-urut", "no_urut", "nomor_urut", "nomor-urut",
    # capitalised / upper-case Indonesian variants
    "No", "NO", "NO.", "No.", "No :", "NO :",
    "Nomor", "NOMOR", "Nomor Urut", "NOMOR URUT",
    "No Urut", "NO URUT", "No. Urut", "NO. URUT",
    # variants followed by a separator
    "No /", "No / ", "No / Nama", "No -", "No - ", "Nomor /", "Nomor -",
    # English variants
    "Number", "No. of", "No of", "Index", "Serial", "Order",
    "ID", "ID No", "ID No.", "Sr No", "Sr. No", "S/N", "SN", "Sl No", "Sl. No",
    # spellings with the digit 0 in place of the letter O
    # (presumably extraction/OCR misreads - TODO confirm)
    "N0", "N0.", "N0 :", "NOM0R", "NOM0R URUT", "N0MOR",
]
def has_number_header(header):
    """Check whether *header* contains a "No"/"Nomor" style keyword.

    Each entry of ``NUMBER_HEADER_KEYWORDS`` is tested with the ``in``
    operator, so the meaning follows the type of *header*: a substring test
    when it is a string, an exact element match when it is a list.
    """
    for keyword in NUMBER_HEADER_KEYWORDS:
        if keyword in header:
            return True
    return False
def is_numbering_column(col_values):
    """Check whether a column is filled with running numbers such as 1, 01, 2.

    Only non-empty string values are considered. The column qualifies when
    more than 60% of them are, after trimming, optional leading zeros
    followed by one to three digits.
    """
    candidates = [v for v in col_values if v and isinstance(v, str)]
    if not candidates:
        return False
    hits = sum(1 for v in candidates if re.fullmatch(r"0*\d{1,3}", v.strip()))
    return (hits / len(candidates)) > 0.6
def is_numeric_value(v):
    """Check whether a value counts as a number.

    ``int`` and ``float`` instances always qualify. A string qualifies when,
    after trimming, it is optional leading zeros followed by one to three
    digits. Everything else, including ``None``, does not.
    """
    if isinstance(v, (int, float)):
        return True
    return isinstance(v, str) and re.fullmatch(r"0*\d{1,3}", v.strip()) is not None
def cleaning_column(headers, bodies):
    """Strip an unlabelled running-number column and drop misaligned rows.

    ``headers`` and ``bodies`` are walked in lockstep with ``zip``. For each
    pair:

    * a falsy body is passed through unchanged;
    * when the paired header has no number keyword (``has_number_header``)
      but the first cells of the body look like running numbers
      (``is_numbering_column``), empty rows are dropped and the first cell
      is removed from every row whose first cell is numeric and which has
      more than one cell;
    * finally only rows whose length equals ``len(headers)`` are kept.

    NOTE(review): the row width is compared with ``len(headers)`` (the whole
    argument), not with the paired ``header``. The caller visible in this
    file passes one table's column list as ``headers`` and a one-element
    list as ``bodies``, so ``header`` is the first column name there.
    Confirm this is intended before changing it.

    Returns:
        A list with one cleaned body per ``(header, body)`` pair.
    """
    result = []
    for header, body in zip(headers, bodies):
        if not body:
            result.append(body)
            continue

        numbered_header = has_number_header(header)
        leading_cells = [row[0] for row in body if row]
        numbered_first_col = is_numbering_column(leading_cells)

        if numbered_first_col and not numbered_header:
            body = [
                row[1:] if is_numeric_value(row[0]) and len(row) > 1 else row
                for row in body
                if row
            ]

        expected_width = len(headers)
        result.append([row for row in body if len(row) == expected_width])
    return result
2025-11-04 07:22:18 +00:00
def parse_page_selection(selectedPage: str, total_pages: int):
    """Turn a page-selection string into a sorted list of valid page numbers.

    The selection is a list of items separated by commas and/or whitespace.
    Each item is either a single page number (``"3"``) or an inclusive range
    (``"2-5"``). Items that cannot be parsed are ignored, as are pages
    outside ``1..total_pages``.

    Args:
        selectedPage: The selection string. ``None``, an empty string or a
            whitespace-only string selects every page.
        total_pages: Number of pages in the document.

    Returns:
        Sorted list of unique 1-based page numbers within ``1..total_pages``.
    """
    if not selectedPage or not selectedPage.strip():
        return list(range(1, total_pages + 1))

    pages = set()
    for part in re.split(r'[,\s]+', selectedPage.strip()):
        if '-' in part:
            try:
                start, end = map(int, part.split('-'))
            except ValueError:
                # Malformed range such as "2-", "-3" or "1-2-3".
                continue
            # Clamp before expanding so a range like "1-1000000000" cannot
            # materialise a huge set; pages outside the document would be
            # discarded below anyway.
            pages.update(range(max(start, 1), min(end, total_pages) + 1))
        else:
            try:
                pages.add(int(part))
            except ValueError:
                continue

    return [p for p in sorted(pages) if 1 <= p <= total_pages]
def read_pdf(path: str, page: str):
    """Extract the tables found on the selected pages of a PDF.

    Steps:

    1. Open the PDF with ``pdfplumber`` and resolve ``page`` into page
       numbers via ``parse_page_selection``.
    2. Collect every table found by ``find_tables()`` on those pages that
       has more than two rows.
    3. For each table, split header and body (``detect_header_rows``), merge
       the header rows into column labels (``merge_multiline_header``),
       drop ``None``/empty cells from the body rows and clean the body with
       ``cleaning_column``.
    4. Pass the resulting list of ``{"title", "columns", "rows"}`` dicts
       through ``filter_geo_admin_column`` and return its result.

    Progress information is written to stdout with ``print``.

    Args:
        path: Path of the PDF file to read.
        page: Page selection string understood by ``parse_page_selection``.

    Returns:
        Whatever ``filter_geo_admin_column`` returns for the parsed tables.
    """
    tables_data = []
    with pdfplumber.open(path) as pdf:
        total_pages = len(pdf.pages)
        selected_pages = parse_page_selection(page, total_pages)
        print(f"[INFO] Total halaman PDF: {total_pages}")
        print(f"[INFO] Halaman yang dipilih untuk dibaca: {selected_pages}")

        for page_num in selected_pages:
            # Selected page numbers are 1-based; pdf.pages is 0-based.
            pdf_page = pdf.pages[page_num - 1]
            tables = pdf_page.find_tables()
            print(f"[INFO] Halaman {page_num}: {len(tables)} tabel terdeteksi")
            for t in tables:
                table = t.extract()
                if len(table) > 2:
                    tables_data.append(table)

    print(f"\nTotal tabel valid: {len(tables_data)}\n")

    parsed = []
    for index, tbl in enumerate(tables_data, start=1):
        head_rows, body_rows = detect_header_rows(tbl)
        columns = merge_multiline_header(head_rows)
        if not columns:
            # cleaning_column zips the column list with the one-element body
            # list, so an empty column list makes it return [] and indexing
            # its first element raised IndexError. A table without any
            # column label cannot be represented, so it is skipped.
            print(f"[WARN] Tabel ke-{index}: header tidak terdeteksi, tabel dilewati")
            continue

        compact_body = [
            [cell for cell in row if cell not in (None, '')]
            for row in body_rows
        ]
        cleaned = cleaning_column(columns, [compact_body])
        parsed.append({
            "title": str(len(parsed) + 1),
            "columns": columns,
            "rows": cleaned[0]
        })

    clean_parsed = filter_geo_admin_column(parsed)
    print(f"parsed{clean_parsed}")
    return clean_parsed
2025-10-29 10:07:48 +00:00
def convert_df(payload):
    """Build a pandas DataFrame from a ``{"columns", "rows"}`` payload.

    Args:
        payload: Mapping with a ``"columns"`` list and a ``"rows"`` list of
            rows; an optional ``"title"`` is stored in ``df.attrs["title"]``.

    Returns:
        The DataFrame built from the rows with the given column labels.

    Raises:
        ValueError: If ``"columns"`` or ``"rows"`` is missing, or a row's
            length differs from the number of columns.
        TypeError: If ``"columns"`` or ``"rows"`` is not a list.
    """
    if not ("columns" in payload and "rows" in payload):
        raise ValueError("Payload tidak memiliki key 'columns' atau 'rows'.")

    columns = payload["columns"]
    rows = payload["rows"]
    if not isinstance(columns, list):
        raise TypeError("'columns' harus berupa list.")
    if not isinstance(rows, list):
        raise TypeError("'rows' harus berupa list.")

    n_columns = len(columns)
    for i, width in enumerate(map(len, rows)):
        if width != n_columns:
            raise ValueError(f"Jumlah elemen di baris ke-{i} tidak sesuai jumlah kolom.")

    df = pd.DataFrame(rows, columns=columns)
    if "title" in payload:
        df.attrs["title"] = payload["title"]
    return df
def test_read_pdf():
    """Return a fixed two-table sample shaped like parsed table payloads.

    Each entry is a dict with ``"title"``, ``"columns"`` and ``"rows"``; all
    cell values are strings. No PDF is read.
    """
    # The sample was pasted from JSON; its "\/" escapes are written as a
    # plain "/" here because "\/" is not a valid Python escape sequence (it
    # would keep a literal backslash in the text and trigger a SyntaxWarning
    # on Python 3.12+).
    parsed = [
        {
            "title": "Luas Catchment Area (km2) Pada Wilayah Sungai di Provinsi Jawa Timur",
            "columns": ["Wilayah Sungai", "Luas (km2)", "Jumlah DAS"],
            "rows": [
                ["Bengawan Solo", "13.070,00", "94 DAS"],
                ["Brantas", "13.880,00", "20 DAS"],
                ["Welang -Rejoso", "2.601,00", "36 DAS"],
                ["Pekalen -Sampean", "3.953,00", "56 DAS"],
                ["Baru -Bajulmati", "3.675,00", "60 DAS"],
                ["Bondoyudo -Bedadung", "5.364,00", "47 DAS"],
                ["Madura", "4.575,00", "173 DAS"],
            ],
        },
        {
            "title": "Jumlah dan Kepadatan Penduduk Menurut Kabupaten/kota di Provinsi Jawa Timur Tahun 2021",
            "columns": [
                "Kabupaten/Kota",
                "Jumlah Penduduk",
                "Persentase",
                "Kepadatan Penduduk (Jiwa per Km2)",
            ],
            "rows": [
                ["Bangkalan", "1.082.759", "2,64", "1.081,20"],
                ["Banyuwangi", "1.749.773", "4,27", "302,60"],
                ["Blitar", "1.228.292", "3,00", "919,05"],
                ["Bojonegoro", "1.343.895", "3,28", "611,20"],
                ["Bondowoso", "801.541", "1,96", "525,27"],
                ["Gresik", "1.283.961", "3,13", "1.077,83"],
                ["Jember", "2.581.486", "6,30", "834,80"],
                ["Jombang", "1.350.483", "3,29", "1.211,10"],
                ["Kediri", "1.671.821", "4,08", "1.206,18"],
                ["Lamongan", "1.379.731", "3,37", "774,24"],
                ["Lumajang", "1.091.856", "2,66", "609,67"],
                ["Madiun", "754.263", "1,84", "726,94"],
                ["Magetan", "689.369", "1,68", "1.000,77"],
                ["Malang", "2.611.907", "6,37", "739,78"],
                ["Mojokerto", "1.126.540", "2,75", "1.569,37"],
                ["Nganjuk", "1.133.556", "2,77", "925,92"],
                ["Ngawi", "896.768", "2,19", "691,96"],
                ["Pacitan", "597.580", "1,46", "429,94"],
                ["Pamekasan", "840.790", "2,05", "1.061,28"],
                ["Pasuruan", "1.603.754", "3,91", "1.088,01"],
                ["Ponorogo", "968.681", "2,36", "741,89"],
                ["Probolinggo", "1.156.570", "2,82", "681,86"],
                ["Sampang", "902.514", "2,20", "731,92"],
                ["Sidoarjo", "1.951.723", "4,76", "3.076,58"],
                ["Situbondo", "666.245", "1,63", "398,98"],
                ["Sumenep", "1.134.750", "2,77", "567,79"],
                ["Trenggalek", "746.734", "1,82", "650,91"],
                ["Tuban", "1.223.257", "2,98", "666,93"],
                ["Tulungagung", "1.126.679", "2,75", "1.067,28"],
                ["Kota Batu", "215.248", "0,53", "1.574,14"],
                ["Kota Blitar", "158.123", "0,39", "4.854,87"],
                ["Kota Kediri", "292.363", "0,71", "4.611,40"],
                ["Kota Madiun", "201.243", "0,49", "6.045,15"],
                ["Kota Malang", "866.356", "2,11", "5.963,35"],
                ["Kota Mojokerto", "139.961", "0,34", "8.497,94"],
                ["Kota Pasuruan", "210.341", "0,51", "5.960,36"],
                ["Kota Probolinggo", "242.246", "0,59", "4.274,68"],
                ["Kota Surabaya", "2.970.843", "7,25", "8.475,05"],
                ["Provinsi Jawa Timur", "40.994.002", "100,00", "76.228,17"],
            ],
        },
    ]
    return parsed