"""Extract tables from a PDF and normalise them into column/row payloads."""

import re

import pandas as pd
import pdfplumber

from services.upload_file.read_pdf.filter_column import filter_geo_admin_column

# Matches short sequence numbers such as "1", "01" or "123".
_SEQUENCE_NUMBER_RE = re.compile(r"0*\d{1,3}")


def is_number(s):
    """Return True when *s* is made only of digits once ',' and '.' are removed.

    ``None`` is never a number. Any other value is converted with ``str()``
    and stripped before the check, so "1.234,50" counts as a number while
    "12a" or "-5" do not.
    """
    if s is None:
        return False
    s = str(s).strip().replace(',', '').replace('.', '')
    return s.isdigit()


def row_ratio(row):
    """Return the share of non-empty cells in *row* that are numbers.

    Cells equal to ``None``, ``''`` or ``' '`` are ignored. A row without any
    non-empty cell yields ``0``.
    """
    non_empty = [c for c in row if c not in (None, '', ' ')]
    if not non_empty:
        return 0
    num_count = sum(is_number(c) for c in non_empty)
    return num_count / len(non_empty)


def has_mixed_text_and_numbers(row):
    """Return True when *row* holds both a cell with ASCII letters and a numeric cell."""
    non_empty = [c for c in row if c not in (None, '', ' ')]
    has_text = any(
        isinstance(c, str) and re.search(r'[A-Za-z]', c) for c in non_empty
    )
    has_num = any(is_number(c) for c in non_empty)
    return has_text and has_num


def is_short_text_row(row):
    """Detect a short text row (at most two non-numeric cells, under 20 characters joined)."""
    non_empty = [str(c).strip() for c in row if c not in (None, '', ' ')]
    if not non_empty:
        return False
    text_only = all(not is_number(c) for c in non_empty)
    joined = " ".join(non_empty)
    return text_only and len(non_empty) <= 2 and len(joined) < 20


def detect_header_rows(rows):
    """Split the rows of a table into header rows and body rows.

    The first row is always treated as a header candidate. Scanning from the
    second row, the body starts at the first row that
      * mixes text and numeric cells, or
      * has more than 30% numeric cells, or
      * contains a string cell made only of digits, or
      * has at least one numeric cell while the previous row had none.
    If no row qualifies, every row is a header candidate and the body is empty.

    Short text rows among the candidates are kept only when the next candidate
    row contains no numeric cells; otherwise they are dropped.

    Returns:
        A ``(header_rows, body_rows)`` tuple. Empty input yields ``([], [])``
        so that callers can always unpack two values.
    """
    if not rows:
        # Same shape as the normal return value; a bare [] broke tuple unpacking.
        return [], []

    ratios = [row_ratio(r) for r in rows]

    body_start_index = None
    for i in range(1, len(rows)):
        row = rows[i]
        if has_mixed_text_and_numbers(row):
            body_start_index = i
            break
        if ratios[i] > 0.3:
            body_start_index = i
            break
        if any(isinstance(c, str) and re.match(r'^\d+$', c.strip()) for c in row):
            body_start_index = i
            break
        if ratios[i - 1] == 0 and ratios[i] > 0:
            body_start_index = i
            break

    if body_start_index is None:
        body_start_index = len(rows)

    potential_headers = rows[:body_start_index]
    body_filtered = rows[body_start_index:]

    header_filtered = []
    for idx, row in enumerate(potential_headers):
        if is_short_text_row(row):
            # potential_headers is a prefix of rows, so ratios shares its indices.
            next_is_text = (
                idx + 1 < len(potential_headers) and ratios[idx + 1] == 0
            )
            if not next_is_text:
                continue
        header_filtered.append(row)

    return header_filtered, body_filtered


def merge_multiline_header(header_rows):
    """Collapse several header rows into one list of column names.

    For each column the bottom-most non-blank cell is used, newlines are
    replaced with spaces and the result is stripped. Columns that end up
    empty are removed, so the result can be shorter than the input rows.
    """
    final_header = []
    for col in zip(*header_rows):
        val = next((v for v in reversed(col) if v and str(v).strip()), '')
        val = str(val).replace('\n', ' ').strip()
        final_header.append(val)
    final_header = [v for v in final_header if v not in ['', None]]
    return final_header


NUMBER_HEADER_KEYWORDS = [
    "no", "no.", "no .", "no . ", "no :", "no : ", "nomor", "nomor.",
    "nomor :", "nomor urut", "no urut", "no. urut", "no-urut", "no_urut",
    "nomor_urut", "nomor-urut", "No", "NO", "NO.", "No.", "No :", "NO :",
    "Nomor", "NOMOR", "Nomor Urut", "NOMOR URUT", "No Urut", "NO URUT",
    "No. Urut", "NO. URUT", "No /", "No / ", "No / Nama", "No -", "No - ",
    "Nomor /", "Nomor -", "Number", "No. of", "No of", "Index", "Serial",
    "Order", "ID", "ID No", "ID No.", "Sr No", "Sr. No", "S/N", "SN",
    "Sl No", "Sl. No", "N0", "N0.", "N0 :", "NOM0R", "NOM0R URUT", "N0MOR",
]


def has_number_header(header):
    """Check whether *header* contains one of the No/Nomor keywords.

    The check is ``keyword in header``: a case-sensitive substring test when
    *header* is a string, and an exact element match when it is a list.
    """
    return any(keyword in header for keyword in NUMBER_HEADER_KEYWORDS)


def is_numbering_column(col_values):
    """Check whether a column is filled with sequence numbers such as 1, 01, 2.

    Only non-empty string values are counted. Returns True when more than 60%
    of them are one to three digits, optionally preceded by zeros.
    """
    numeric_like = 0
    total = 0
    for v in col_values:
        if not v or not isinstance(v, str):
            continue
        total += 1
        if _SEQUENCE_NUMBER_RE.fullmatch(v.strip()):
            numeric_like += 1
    return total > 0 and (numeric_like / total) > 0.6


def is_numeric_value(v):
    """Check whether *v* is an int, a float, or a short sequence-number string."""
    if v is None:
        return False
    if isinstance(v, (int, float)):
        return True
    return isinstance(v, str) and bool(_SEQUENCE_NUMBER_RE.fullmatch(v.strip()))


def cleaning_column(headers, bodies):
    """Drop an unlabeled numbering column and rows of the wrong width.

    Each body is paired positionally with one entry of *headers*. When that
    entry has no No/Nomor keyword but the first column of the body looks like
    a sequence-number column, the leading numeric cell of every row with more
    than one cell is removed and empty rows are discarded. Afterwards only
    rows whose length equals ``len(headers)`` are kept.

    An empty body is passed through unchanged. The result holds one entry per
    (header, body) pair, so it is empty when either argument is empty.

    NOTE(review): bodies are paired with individual entries of *headers* while
    the row-width filter uses ``len(headers)``. This is only coherent for the
    call shape used by ``read_pdf`` (one flat list of column names plus a
    single body) - confirm before passing several bodies.
    """
    header_len = len(headers)
    cleaned_bodies = []

    for header, body in zip(headers, bodies):
        if not body:
            cleaned_bodies.append(body)
            continue

        header_has_number = has_number_header(header)
        first_col = [row[0] for row in body if row]
        first_col_is_numbering = is_numbering_column(first_col)

        if not header_has_number and first_col_is_numbering:
            new_body = []
            for row in body:
                if not row:
                    continue
                if is_numeric_value(row[0]) and len(row) > 1:
                    new_body.append(row[1:])
                else:
                    new_body.append(row)
            body = new_body

        cleaned_bodies.append([row for row in body if len(row) == header_len])

    return cleaned_bodies


def parse_page_selection(selectedPage: str, total_pages: int):
    """Turn a selection such as "1,3-5 7" into a sorted list of page numbers.

    Parts are separated by commas and/or whitespace. A part is either a single
    page or an inclusive ``start-end`` range. Parts that cannot be parsed are
    skipped, and pages outside ``1..total_pages`` are dropped. An empty or
    ``None`` selection selects every page.
    """
    if not selectedPage:
        return list(range(1, total_pages + 1))

    pages = set()
    for part in re.split(r'[,\s]+', selectedPage.strip()):
        try:
            if '-' in part:
                start, end = map(int, part.split('-'))
                # Clamp before expanding so an oversized range stays cheap.
                pages.update(range(max(start, 1), min(end, total_pages) + 1))
            else:
                pages.add(int(part))
        except ValueError:
            continue

    return [p for p in sorted(pages) if 1 <= p <= total_pages]


def read_pdf(path: str, page: str):
    """Read the tables found on the selected pages of a PDF file.

    Args:
        path: Location of the PDF, passed to ``pdfplumber.open``.
        page: Page selection in the format accepted by
            ``parse_page_selection`` (e.g. "1", "2-4", "1,3"). A blank or
            ``None`` value selects page 1.

    Returns:
        The result of ``filter_geo_admin_column`` applied to a list of dicts
        with the keys ``"title"`` (1-based table index as a string),
        ``"columns"`` and ``"rows"``.

    Only tables with more than two extracted rows are processed.
    """
    # Honour the caller's selection; fall back to the first page when blank.
    selectedPage = str(page).strip() if page is not None else ""
    if not selectedPage:
        selectedPage = "1"

    tables_data = []
    with pdfplumber.open(path) as pdf:
        total_pages = len(pdf.pages)
        selected_pages = parse_page_selection(selectedPage, total_pages)

        print(f"[INFO] Total halaman PDF: {total_pages}")
        print(f"[INFO] Halaman yang dipilih untuk dibaca: {selected_pages}")

        for page_num in selected_pages:
            pdf_page = pdf.pages[page_num - 1]  # pdf.pages is 0-indexed
            tables = pdf_page.find_tables()
            print(f"[INFO] Halaman {page_num}: {len(tables)} tabel terdeteksi")

            for t in tables:
                table = t.extract()
                if len(table) > 2:
                    tables_data.append(table)

    print(f"\nTotal tabel valid: {len(tables_data)}\n")

    parsed = []
    for index, table in enumerate(tables_data, start=1):
        header_rows, body_rows = detect_header_rows(table)
        columns = merge_multiline_header(header_rows)

        # Drop blank cells so the row width can be compared with the header.
        compact_body = [
            [cell for cell in row if cell not in (None, '')]
            for row in body_rows
        ]
        cleaned = cleaning_column(columns, [compact_body])
        # cleaning_column returns nothing when the table has no column names.
        rows = cleaned[0] if cleaned else []

        parsed.append({
            "title": str(index),
            "columns": columns,
            "rows": rows,
        })

    return filter_geo_admin_column(parsed)


def convert_df(payload):
    """Build a pandas DataFrame from a ``{"columns": [...], "rows": [...]}`` payload.

    When the payload has a ``"title"`` key, its value is stored in
    ``df.attrs["title"]``.

    Raises:
        ValueError: If ``"columns"`` or ``"rows"`` is missing, or a row's
            length differs from the number of columns.
        TypeError: If ``"columns"`` or ``"rows"`` is not a list.
    """
    if "columns" not in payload or "rows" not in payload:
        raise ValueError("Payload tidak memiliki key 'columns' atau 'rows'.")

    if not isinstance(payload["columns"], list):
        raise TypeError("'columns' harus berupa list.")
    if not isinstance(payload["rows"], list):
        raise TypeError("'rows' harus berupa list.")

    for i, row in enumerate(payload["rows"]):
        if len(row) != len(payload["columns"]):
            raise ValueError(f"Jumlah elemen di baris ke-{i} tidak sesuai jumlah kolom.")

    df = pd.DataFrame(payload["rows"], columns=payload["columns"])

    if "title" in payload:
        df.attrs["title"] = payload["title"]

    return df


def test_read_pdf():
    """Return a hard-coded two-table payload shaped like the ``read_pdf`` input to filtering.

    No PDF is read. The literal backslash in "Kabupaten\\/..." is part of the
    fixture data and is kept as is.
    """
    parsed = [
        {
            "title": "Luas Catchment Area (km2) Pada Wilayah Sungai di Provinsi Jawa Timur",
            "columns": ["Wilayah Sungai", "Luas (km2)", "Jumlah DAS"],
            "rows": [
                ["Bengawan Solo", "13.070,00", "94 DAS"],
                ["Brantas", "13.880,00", "20 DAS"],
                ["Welang -Rejoso", "2.601,00", "36 DAS"],
                ["Pekalen -Sampean", "3.953,00", "56 DAS"],
                ["Baru -Bajulmati", "3.675,00", "60 DAS"],
                ["Bondoyudo -Bedadung", "5.364,00", "47 DAS"],
                ["Madura", "4.575,00", "173 DAS"],
            ],
        },
        {
            "title": "Jumlah dan Kepadatan Penduduk Menurut Kabupaten\\/kota di Provinsi Jawa Timur Tahun 2021",
            "columns": [
                "Kabupaten\\/Kota",
                "Jumlah Penduduk",
                "Persentase",
                "Kepadatan Penduduk (Jiwa per Km2)",
            ],
            "rows": [
                ["Bangkalan", "1.082.759", "2,64", "1.081,20"],
                ["Banyuwangi", "1.749.773", "4,27", "302,60"],
                ["Blitar", "1.228.292", "3,00", "919,05"],
                ["Bojonegoro", "1.343.895", "3,28", "611,20"],
                ["Bondowoso", "801.541", "1,96", "525,27"],
                ["Gresik", "1.283.961", "3,13", "1.077,83"],
                ["Jember", "2.581.486", "6,30", "834,80"],
                ["Jombang", "1.350.483", "3,29", "1.211,10"],
                ["Kediri", "1.671.821", "4,08", "1.206,18"],
                ["Lamongan", "1.379.731", "3,37", "774,24"],
                ["Lumajang", "1.091.856", "2,66", "609,67"],
                ["Madiun", "754.263", "1,84", "726,94"],
                ["Magetan", "689.369", "1,68", "1.000,77"],
                ["Malang", "2.611.907", "6,37", "739,78"],
                ["Mojokerto", "1.126.540", "2,75", "1.569,37"],
                ["Nganjuk", "1.133.556", "2,77", "925,92"],
                ["Ngawi", "896.768", "2,19", "691,96"],
                ["Pacitan", "597.580", "1,46", "429,94"],
                ["Pamekasan", "840.790", "2,05", "1.061,28"],
                ["Pasuruan", "1.603.754", "3,91", "1.088,01"],
                ["Ponorogo", "968.681", "2,36", "741,89"],
                ["Probolinggo", "1.156.570", "2,82", "681,86"],
                ["Sampang", "902.514", "2,20", "731,92"],
                ["Sidoarjo", "1.951.723", "4,76", "3.076,58"],
                ["Situbondo", "666.245", "1,63", "398,98"],
                ["Sumenep", "1.134.750", "2,77", "567,79"],
                ["Trenggalek", "746.734", "1,82", "650,91"],
                ["Tuban", "1.223.257", "2,98", "666,93"],
                ["Tulungagung", "1.126.679", "2,75", "1.067,28"],
                ["Kota Batu", "215.248", "0,53", "1.574,14"],
                ["Kota Blitar", "158.123", "0,39", "4.854,87"],
                ["Kota Kediri", "292.363", "0,71", "4.611,40"],
                ["Kota Madiun", "201.243", "0,49", "6.045,15"],
                ["Kota Malang", "866.356", "2,11", "5.963,35"],
                ["Kota Mojokerto", "139.961", "0,34", "8.497,94"],
                ["Kota Pasuruan", "210.341", "0,51", "5.960,36"],
                ["Kota Probolinggo", "242.246", "0,59", "4.274,68"],
                ["Kota Surabaya", "2.970.843", "7,25", "8.475,05"],
                ["Provinsi Jawa Timur", "40.994.002", "100,00", "76.228,17"],
            ],
        },
    ]
    return parsed