264 lines
13 KiB
Python
264 lines
13 KiB
Python
import pdfplumber
|
|
import re
|
|
import pandas as pd
|
|
from services.filter_column import filter_geo_admin_column
|
|
|
|
def is_number(s):
    """Return True when *s* looks like an unsigned number.

    The value is converted to ``str``, surrounding whitespace is trimmed and
    every comma and period is removed (so ``"1.234,56"`` counts as numeric)
    before the remaining characters are checked with ``str.isdigit``.
    ``None`` and values that end up empty are not numbers.
    """
    if s is None:
        return False
    digits_only = str(s).strip()
    for separator in (',', '.'):
        digits_only = digits_only.replace(separator, '')
    return digits_only.isdigit()
|
|
|
|
def row_ratio(row):
    """Return the fraction of non-empty cells in *row* that are numeric.

    Cells equal to ``None``, ``''`` or ``' '`` are ignored. A row without any
    remaining cell yields ``0``; otherwise the result is a float in [0, 1]
    based on ``is_number``.
    """
    filled = [cell for cell in row if cell not in (None, '', ' ')]
    if len(filled) == 0:
        return 0
    numeric = [cell for cell in filled if is_number(cell)]
    return len(numeric) / len(filled)
|
|
|
|
def has_mixed_text_and_numbers(row):
    """Return True when *row* holds both a text cell and a numeric cell.

    A text cell is a ``str`` containing at least one ASCII letter; a numeric
    cell is one accepted by ``is_number``. Cells equal to ``None``, ``''`` or
    ``' '`` are ignored.
    """
    cells = [cell for cell in row if cell not in (None, '', ' ')]
    contains_number = any(is_number(cell) for cell in cells)
    contains_text = False
    for cell in cells:
        if isinstance(cell, str) and re.search(r'[A-Za-z]', cell):
            contains_text = True
            break
    return contains_text and contains_number
|
|
|
|
def is_short_text_row(row):
    """Detect a short text row (one or two short text cells).

    After dropping cells equal to ``None``, ``''`` or ``' '`` and stripping
    the rest, the row qualifies when it has at most two cells, none of them
    is numeric according to ``is_number``, and the cells joined with a single
    space are shorter than 20 characters. A row with no remaining cell does
    not qualify.
    """
    cells = [str(cell).strip() for cell in row if cell not in (None, '', ' ')]
    if not cells:
        return False
    if len(cells) > 2 or len(" ".join(cells)) >= 20:
        return False
    return not any(is_number(cell) for cell in cells)
|
|
|
|
def detect_header_rows(rows):
    """Split the rows of an extracted table into header rows and body rows.

    The scan starts at the second row (the first row is never treated as the
    start of the body) and stops at the first row that satisfies any of:

    * it mixes text and numeric cells (``has_mixed_text_and_numbers``);
    * more than 30% of its non-empty cells are numeric (``row_ratio``);
    * it contains a string cell made only of digits;
    * it has at least one numeric cell while the previous row had none.

    Every row before that point is a header candidate. Short text rows
    (``is_short_text_row``) are dropped from the candidates unless the next
    candidate row has no numeric cell.

    Args:
        rows: Table rows, each a sequence of cell values.

    Returns:
        A ``(header_rows, body_rows)`` tuple of lists. Both lists are empty
        when *rows* is empty.
    """
    if not rows:
        # Always hand back a pair so callers can unpack the result safely.
        return [], []

    ratios = [row_ratio(r) for r in rows]

    # Default: no body row found, so every row is a header candidate.
    body_start_index = len(rows)
    for i in range(1, len(rows)):
        row = rows[i]
        starts_body = (
            has_mixed_text_and_numbers(row)
            or ratios[i] > 0.3
            or any(isinstance(c, str) and re.match(r'^\d+$', c.strip()) for c in row)
            or (ratios[i - 1] == 0 and ratios[i] > 0)
        )
        if starts_body:
            body_start_index = i
            break

    potential_headers = rows[:body_start_index]
    body_filtered = rows[body_start_index:]

    header_filtered = []
    for idx, row in enumerate(potential_headers):
        if is_short_text_row(row):
            # Candidate indices line up with ``ratios`` because the candidates
            # are a prefix of ``rows``.
            next_is_text_only = (
                idx + 1 < len(potential_headers) and ratios[idx + 1] == 0
            )
            if not next_is_text_only:
                continue
        header_filtered.append(row)

    return header_filtered, body_filtered
|
|
|
|
|
|
def merge_multiline_header(header_rows):
    """Collapse stacked header rows into a single list of column labels.

    For each column position (rows are zipped, so the shortest row sets the
    column count) the bottom-most cell that is truthy and not blank is taken,
    converted to ``str``, has its newlines replaced by spaces and is
    stripped. Columns that end up with an empty label are left out.
    """
    merged = []
    for column_cells in zip(*header_rows):
        label = ''
        for cell in reversed(column_cells):
            if cell and str(cell).strip():
                label = cell
                break
        label = str(label).replace('\n', ' ').strip()
        if label:
            merged.append(label)
    return merged
|
|
|
|
|
|
|
|
# Spellings recognised by has_number_header as a running-number column label.
NUMBER_HEADER_KEYWORDS = [
    # Lower-case spellings.
    "no", "no.", "no .", "no . ", "no :", "no : ",
    "nomor", "nomor.", "nomor :", "nomor urut",
    "no urut", "no. urut", "no-urut", "no_urut",
    "nomor_urut", "nomor-urut",
    # Capitalised and upper-case spellings.
    "No", "NO", "NO.", "No.", "No :", "NO :",
    "Nomor", "NOMOR", "Nomor Urut", "NOMOR URUT",
    "No Urut", "NO URUT", "No. Urut", "NO. URUT",
    # Spellings followed by a separator.
    "No /", "No / ", "No / Nama", "No -", "No - ",
    "Nomor /", "Nomor -",
    # English labels.
    "Number", "No. of", "No of", "Index", "Serial", "Order",
    "ID", "ID No", "ID No.",
    "Sr No", "Sr. No", "S/N", "SN", "Sl No", "Sl. No",
    # Spellings with the digit 0 in place of the letter O.
    "N0", "N0.", "N0 :", "NOM0R", "NOM0R URUT", "N0MOR",
]
|
|
|
|
def has_number_header(header):
    """Check whether *header* contains a No/Nomor (running number) label.

    Each entry of ``NUMBER_HEADER_KEYWORDS`` is tested with ``in`` against
    *header* as given: a substring test when *header* is a string, a
    membership test when it is a list of labels.
    """
    for keyword in NUMBER_HEADER_KEYWORDS:
        if keyword in header:
            return True
    return False
|
|
|
|
def is_numbering_column(col_values):
    """Check whether a column is filled with running numbers (1, 01, 2, ...).

    Only non-empty string values are counted. The column qualifies when more
    than 60% of them, once stripped, consist of optional leading zeros
    followed by one to three digits.
    """
    candidates = [value for value in col_values if value and isinstance(value, str)]
    if not candidates:
        return False
    hits = sum(
        1 for value in candidates if re.fullmatch(r"0*\d{1,3}", value.strip())
    )
    return hits / len(candidates) > 0.6
|
|
|
|
def is_numeric_value(v):
    """Check whether a value counts as a number.

    ``int`` and ``float`` instances always qualify. A string qualifies when,
    once stripped, it consists of optional leading zeros followed by one to
    three digits. Anything else, including ``None``, does not.
    """
    if isinstance(v, (int, float)):
        return True
    if isinstance(v, str):
        return re.fullmatch(r"0*\d{1,3}", v.strip()) is not None
    return False
|
|
|
|
def cleaning_column(headers, bodies):
    """Strip an unlabelled numbering column and drop rows of the wrong width.

    ``headers`` and ``bodies`` are walked in lockstep with ``zip``, so the
    number of processed bodies is limited by the shorter of the two. For
    each pair:

    * an empty body is passed through untouched;
    * when the paired header entry has no numbering label
      (``has_number_header``) but the first cells of the body look like a
      running number (``is_numbering_column``), empty rows are discarded and
      the first cell is removed from every row whose first cell is numeric
      and which has more than one cell;
    * finally only rows whose length equals ``len(headers)`` are kept.

    Returns:
        A list with one cleaned body per processed pair.
    """
    result = []

    for header, body in zip(headers, bodies):
        if not body:
            result.append(body)
            continue

        labelled = has_number_header(header)
        leading_cells = [row[0] for row in body if row and len(row) > 0]
        numbered = is_numbering_column(leading_cells)

        if numbered and not labelled:
            body = [
                row[1:] if is_numeric_value(row[0]) and len(row) > 1 else row
                for row in body
                if row
            ]

        # The width check uses the whole ``headers`` sequence, not the single
        # paired entry.
        expected_width = len(headers)
        result.append([row for row in body if len(row) == expected_width])

    return result
|
|
|
|
|
|
|
|
|
|
def parse_page_selection(selectedPage: str, total_pages: int):
    """Turn a page-selection string into a sorted list of valid page numbers.

    The selection is split on commas and/or whitespace. Each token is either
    a single page (``"3"``) or an inclusive range (``"2-5"``). Malformed
    tokens (empty, non-numeric, ``"1-2-3"``, ``"4-"`` ...) are skipped, and a
    range whose start is greater than its end contributes nothing.

    Args:
        selectedPage: Selection such as ``"1, 3-5 8"``. A falsy value
            (``None`` or ``""``) selects every page.
        total_pages: Number of pages in the document; pages are 1-based.

    Returns:
        The selected page numbers within ``1..total_pages``, ascending and
        without duplicates.
    """
    if not selectedPage:
        return list(range(1, total_pages + 1))

    pages = set()
    for part in re.split(r'[,\s]+', selectedPage.strip()):
        try:
            if '-' in part:
                start, end = map(int, part.split('-'))
            else:
                start = end = int(part)
        except ValueError:
            continue
        # Clamp before expanding: a selection like "1-99999999999" must not
        # materialise billions of page numbers only to discard them again.
        pages.update(range(max(start, 1), min(end, total_pages) + 1))

    return sorted(pages)
|
|
|
|
|
|
|
|
def read_pdf(path: str, page: str):
    """Extract the tables found on the selected pages of a PDF file.

    Every table with more than two rows is split into header and body rows
    (``detect_header_rows``), its header rows are merged into column labels
    (``merge_multiline_header``), ``None``/empty cells are removed from the
    body rows and the body is cleaned with ``cleaning_column``. The resulting
    list of ``{"title", "columns", "rows"}`` dicts, titled ``"1"``, ``"2"``,
    ... in extraction order, is passed through ``filter_geo_admin_column``.

    Progress information is printed to standard output.

    Args:
        path: Location of the PDF file.
        page: Page selection understood by ``parse_page_selection``; a falsy
            value reads every page.

    Returns:
        Whatever ``filter_geo_admin_column`` returns for the parsed tables.
    """
    tables_data = []
    with pdfplumber.open(path) as pdf:
        total_pages = len(pdf.pages)
        selected_pages = parse_page_selection(page, total_pages)

        print(f"[INFO] Total halaman PDF: {total_pages}")
        print(f"[INFO] Halaman yang dipilih untuk dibaca: {selected_pages}")

        for page_num in selected_pages:
            # Page numbers are 1-based while pdf.pages is indexed from 0.
            pdf_page = pdf.pages[page_num - 1]
            tables = pdf_page.find_tables()
            print(f"[INFO] Halaman {page_num}: {len(tables)} tabel terdeteksi")

            for t in tables:
                table = t.extract()
                if len(table) > 2:
                    tables_data.append(table)

    print(f"\nTotal tabel valid: {len(tables_data)}\n")

    parsed = []
    for index, table in enumerate(tables_data, start=1):
        header_rows, body_rows = detect_header_rows(table)
        columns = merge_multiline_header(header_rows)
        compact_rows = [
            [cell for cell in row if cell not in (None, '')] for row in body_rows
        ]
        cleaned = cleaning_column(columns, [compact_rows])
        # cleaning_column yields nothing when ``columns`` is empty (no usable
        # header was detected); fall back to an empty body instead of
        # failing on cleaned[0].
        rows = cleaned[0] if cleaned else []
        parsed.append({
            "title": str(index),
            "columns": columns,
            "rows": rows,
        })

    clean_parsed = filter_geo_admin_column(parsed)
    print(f"parsed{clean_parsed}")
    return clean_parsed
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def convert_df(payload):
    """Build a pandas DataFrame from a ``{"columns", "rows"}`` payload.

    Args:
        payload: Mapping with a ``"columns"`` list and a ``"rows"`` list of
            rows; an optional ``"title"`` is stored in ``DataFrame.attrs``.

    Returns:
        The DataFrame built from the rows, labelled with the given columns.

    Raises:
        ValueError: If ``"columns"`` or ``"rows"`` is missing, or a row's
            length differs from the number of columns.
        TypeError: If ``"columns"`` or ``"rows"`` is not a list.
    """
    if not ("columns" in payload and "rows" in payload):
        raise ValueError("Payload tidak memiliki key 'columns' atau 'rows'.")

    columns = payload["columns"]
    rows = payload["rows"]

    if not isinstance(columns, list):
        raise TypeError("'columns' harus berupa list.")
    if not isinstance(rows, list):
        raise TypeError("'rows' harus berupa list.")

    expected = len(columns)
    for i, row in enumerate(rows):
        if len(row) != expected:
            raise ValueError(f"Jumlah elemen di baris ke-{i} tidak sesuai jumlah kolom.")

    frame = pd.DataFrame(rows, columns=columns)

    if "title" in payload:
        frame.attrs["title"] = payload["title"]

    return frame
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_read_pdf():
    """Return a hard-coded two-table sample shaped like parsed PDF tables.

    Each entry is a dict with a ``"title"``, a ``"columns"`` list and a
    ``"rows"`` list whose rows have one string per column. No PDF is read.
    """
    return [
        {
            "title": "Luas Catchment Area (km2) Pada Wilayah Sungai di Provinsi Jawa Timur",
            "columns": ["Wilayah Sungai", "Luas (km2)", "Jumlah DAS"],
            "rows": [
                ["Bengawan Solo", "13.070,00", "94 DAS"],
                ["Brantas", "13.880,00", "20 DAS"],
                ["Welang -Rejoso", "2.601,00", "36 DAS"],
                ["Pekalen -Sampean", "3.953,00", "56 DAS"],
                ["Baru -Bajulmati", "3.675,00", "60 DAS"],
                ["Bondoyudo -Bedadung", "5.364,00", "47 DAS"],
                ["Madura", "4.575,00", "173 DAS"],
            ],
        },
        {
            # The source data was JSON-escaped ("\/"); plain "/" is the
            # intended character and avoids an invalid escape sequence.
            "title": "Jumlah dan Kepadatan Penduduk Menurut Kabupaten/kota di Provinsi Jawa Timur Tahun 2021",
            "columns": [
                "Kabupaten/Kota",
                "Jumlah Penduduk",
                "Persentase",
                "Kepadatan Penduduk (Jiwa per Km2)",
            ],
            "rows": [
                ["Bangkalan", "1.082.759", "2,64", "1.081,20"],
                ["Banyuwangi", "1.749.773", "4,27", "302,60"],
                ["Blitar", "1.228.292", "3,00", "919,05"],
                ["Bojonegoro", "1.343.895", "3,28", "611,20"],
                ["Bondowoso", "801.541", "1,96", "525,27"],
                ["Gresik", "1.283.961", "3,13", "1.077,83"],
                ["Jember", "2.581.486", "6,30", "834,80"],
                ["Jombang", "1.350.483", "3,29", "1.211,10"],
                ["Kediri", "1.671.821", "4,08", "1.206,18"],
                ["Lamongan", "1.379.731", "3,37", "774,24"],
                ["Lumajang", "1.091.856", "2,66", "609,67"],
                ["Madiun", "754.263", "1,84", "726,94"],
                ["Magetan", "689.369", "1,68", "1.000,77"],
                ["Malang", "2.611.907", "6,37", "739,78"],
                ["Mojokerto", "1.126.540", "2,75", "1.569,37"],
                ["Nganjuk", "1.133.556", "2,77", "925,92"],
                ["Ngawi", "896.768", "2,19", "691,96"],
                ["Pacitan", "597.580", "1,46", "429,94"],
                ["Pamekasan", "840.790", "2,05", "1.061,28"],
                ["Pasuruan", "1.603.754", "3,91", "1.088,01"],
                ["Ponorogo", "968.681", "2,36", "741,89"],
                ["Probolinggo", "1.156.570", "2,82", "681,86"],
                ["Sampang", "902.514", "2,20", "731,92"],
                ["Sidoarjo", "1.951.723", "4,76", "3.076,58"],
                ["Situbondo", "666.245", "1,63", "398,98"],
                ["Sumenep", "1.134.750", "2,77", "567,79"],
                ["Trenggalek", "746.734", "1,82", "650,91"],
                ["Tuban", "1.223.257", "2,98", "666,93"],
                ["Tulungagung", "1.126.679", "2,75", "1.067,28"],
                ["Kota Batu", "215.248", "0,53", "1.574,14"],
                ["Kota Blitar", "158.123", "0,39", "4.854,87"],
                ["Kota Kediri", "292.363", "0,71", "4.611,40"],
                ["Kota Madiun", "201.243", "0,49", "6.045,15"],
                ["Kota Malang", "866.356", "2,11", "5.963,35"],
                ["Kota Mojokerto", "139.961", "0,34", "8.497,94"],
                ["Kota Pasuruan", "210.341", "0,51", "5.960,36"],
                ["Kota Probolinggo", "242.246", "0,59", "4.274,68"],
                ["Kota Surabaya", "2.970.843", "7,25", "8.475,05"],
                ["Provinsi Jawa Timur", "40.994.002", "100,00", "76.228,17"],
            ],
        },
    ]