2025-10-29 10:07:48 +00:00
import pdfplumber
import re
import pandas as pd
def is_number(s):
    """Return True when *s* consists only of digits once separators are removed.

    The value is converted with ``str``, stripped of surrounding whitespace,
    and has every ``,`` and ``.`` removed before the ``str.isdigit`` check, so
    ``"1.234"`` and ``"12,5"`` both count as numbers. ``None`` is never a
    number; signed values such as ``"-5"`` are not recognised.
    """
    if s is None:
        return False
    text = str(s).strip()
    without_commas = text.replace(',', '')
    digits_only = without_commas.replace('.', '')
    return digits_only.isdigit()
def row_ratio(row):
    """Return the fraction of non-empty cells in *row* that are numbers.

    Cells equal to ``None``, ``''`` or ``' '`` are ignored. A row without any
    remaining cell yields ``0``.
    """
    filled = [cell for cell in row if cell not in (None, '', ' ')]
    if filled:
        numeric = sum(1 for cell in filled if is_number(cell))
        return numeric / len(filled)
    return 0
def has_mixed_text_and_numbers(row):
    """Return True when *row* holds both a textual cell and a numeric cell.

    A textual cell is a ``str`` containing at least one ASCII letter; a
    numeric cell is one accepted by ``is_number``. Empty cells (``None``,
    ``''``, ``' '``) are ignored.
    """
    cells = [cell for cell in row if cell not in (None, '', ' ')]

    contains_text = False
    for cell in cells:
        if isinstance(cell, str) and re.search(r'[A-Za-z]', str(cell)):
            contains_text = True
            break

    contains_number = False
    for cell in cells:
        if is_number(cell):
            contains_number = True
            break

    return contains_text and contains_number
def is_short_text_row(row):
    """Detect a short text row (one or two short, non-numeric cells).

    Empty cells (``None``, ``''``, ``' '``) are dropped and the rest are
    stringified and stripped. The row qualifies when at most two cells
    remain, none of them is a number according to ``is_number``, and the
    cells joined with a single space are shorter than 20 characters.
    """
    texts = [str(cell).strip() for cell in row if cell not in (None, '', ' ')]
    if not texts:
        return False
    if len(texts) > 2:
        return False
    if any(is_number(text) for text in texts):
        return False
    return len(" ".join(texts)) < 20
def detect_header_rows(rows):
    """Split the rows of a table into header rows and body rows.

    The first row is always treated as a header candidate. Scanning from the
    second row onwards, the body starts at the first row that satisfies any
    of the following:

    * it mixes text and numbers (``has_mixed_text_and_numbers``);
    * more than 30% of its non-empty cells are numbers (``row_ratio``);
    * it has a string cell made only of digits;
    * it contains at least one number while the previous row contains none.

    If no such row exists, every row is a header candidate. Among the
    candidates, a short text row (``is_short_text_row``) is kept only when it
    is followed by another candidate row that contains no numbers; all other
    candidates are kept.

    Args:
        rows: List of rows, each a sequence of cell values.

    Returns:
        A ``(header_rows, body_rows)`` tuple. Empty input yields ``([], [])``
        so callers can always unpack the result into two names.
    """
    if not rows:
        # Same 2-tuple shape as the normal path; a bare list here would break
        # ``head, body = detect_header_rows(table)``.
        return [], []

    ratios = [row_ratio(row) for row in rows]

    def starts_body(index):
        # True when rows[index] looks like the first data row.
        row = rows[index]
        if has_mixed_text_and_numbers(row):
            return True
        if ratios[index] > 0.3:
            return True
        if any(isinstance(cell, str) and re.match(r'^\d+$', cell.strip()) for cell in row):
            return True
        # Transition from a row without numbers to a row with numbers.
        return ratios[index - 1] == 0 and ratios[index] > 0

    body_start_index = next(
        (index for index in range(1, len(rows)) if starts_body(index)),
        len(rows),
    )

    potential_headers = rows[:body_start_index]
    body_filtered = rows[body_start_index:]

    header_filtered = []
    for idx, row in enumerate(potential_headers):
        next_is_text_only = idx + 1 < len(potential_headers) and ratios[idx + 1] == 0
        # Short text rows survive only when another number-free candidate
        # row follows them.
        if not is_short_text_row(row) or next_is_text_only:
            header_filtered.append(row)

    return header_filtered, body_filtered
def merge_multiline_header(header_rows):
    """Collapse several header rows into one flat list of column names.

    For each column position the bottom-most cell that is non-empty after
    stripping is used. Line breaks inside the chosen cell become spaces and
    the result is stripped. Columns without any usable cell are omitted from
    the returned list.
    """
    merged = []
    for column_cells in zip(*header_rows):
        label = ''
        for cell in reversed(column_cells):
            if cell and str(cell).strip():
                label = cell
                break
        label = str(label).replace('\n', ' ').strip()
        if label:
            merged.append(label)
    return merged
# Spellings of a "row number" column label (Indonesian and English, plus
# variants where the letter O was read as the digit 0).
NUMBER_HEADER_KEYWORDS = [
    "no", "no.", "no .", "no .", "no :", "no :", "nomor", "nomor.",
    "nomor :", "nomor urut", "no urut", "no. urut", "no-urut", "no_urut",
    "nomor_urut", "nomor-urut", "No", "NO", "NO.", "No.", "No :", "NO :",
    "Nomor", "NOMOR", "Nomor Urut", "NOMOR URUT", "No Urut", "NO URUT",
    "No. Urut", "NO. URUT", "No /", "No /", "No / Nama", "No -", "No -",
    "Nomor /", "Nomor -", "Number", "No. of", "No of", "Index", "Serial",
    "Order", "ID", "ID No", "ID No.", "Sr No", "Sr. No", "S/N", "SN",
    "Sl No", "Sl. No", "N0", "N0.", "N0 :", "NOM0R", "NOM0R URUT", "N0MOR",
]

# One alternation over all keywords, anchored on alphanumeric boundaries so a
# keyword only matches as a standalone token ("No", "No.", "No Urut") and not
# inside another word ("Ekonomi", "Ponorogo").
_NUMBER_HEADER_PATTERN = re.compile(
    r"(?<![A-Za-z0-9])(?:"
    + "|".join(re.escape(keyword) for keyword in dict.fromkeys(NUMBER_HEADER_KEYWORDS))
    + r")(?![A-Za-z0-9])"
)


def has_number_header(header):
    """Check whether the header contains a No/Nomor (row number) column.

    Args:
        header: Either a single column name (``str``) or an iterable of
            column names. Non-string cells are ignored.

    Returns:
        True when any column name contains one of ``NUMBER_HEADER_KEYWORDS``
        as a standalone token (case-sensitive, not embedded in a longer
        alphanumeric word); False otherwise, including for ``None``.
    """
    if header is None:
        return False
    cells = [header] if isinstance(header, str) else header
    return any(
        isinstance(cell, str) and _NUMBER_HEADER_PATTERN.search(cell) is not None
        for cell in cells
    )
def is_numbering_column(col_values):
    """Check whether a column is filled with sequence numbers like 1, 01, 2.

    Only non-empty string values are considered. The column qualifies when
    more than 60% of them consist of optional leading zeros followed by one
    to three digits. A column without any non-empty string returns False.
    """
    candidates = [value for value in col_values if value and isinstance(value, str)]
    if not candidates:
        return False
    hits = sum(
        1 for value in candidates if re.fullmatch(r"0*\d{1,3}", value.strip())
    )
    return hits / len(candidates) > 0.6
def is_numeric_value(v):
    """Check whether a value counts as a number.

    ``int`` and ``float`` instances always qualify. A string qualifies when,
    after stripping, it is optional leading zeros followed by one to three
    digits. Everything else, including ``None``, does not.
    """
    if isinstance(v, (int, float)):
        return True
    if isinstance(v, str):
        return re.fullmatch(r"0*\d{1,3}", v.strip()) is not None
    return False
def cleaning_column(headers, bodies):
    """Drop a stray numbering column and keep only rows matching the header width.

    ``headers`` and ``bodies`` are walked in lockstep with ``zip``. Two call
    shapes are supported:

    * ``headers`` is a list of headers, one list of column names per body;
    * ``headers`` is a single flat list of column names. ``zip`` then pairs
      its first column name with the first body, and the expected row width
      is the number of column names.

    For each non-empty body: when ``has_number_header`` finds no number
    column in the paired header but ``is_numbering_column`` says the first
    body column is a running number, the leading numeric cell of every row
    with more than one cell is removed and empty rows are dropped. Afterwards
    only rows whose length equals the header width are kept. Empty bodies are
    passed through unchanged.

    Args:
        headers: List of headers, or one flat list of column names.
        bodies: List of bodies; each body is a list of rows.

    Returns:
        One cleaned body per ``(header, body)`` pair produced by ``zip``.
    """
    cleaned_bodies = []
    for header, body in zip(headers, bodies):
        if not body:
            cleaned_bodies.append(body)
            continue

        # Width of the header belonging to THIS body. With one header list
        # per table that is len(header); in the flat call shape ``header`` is
        # a single column name and the width is the length of ``headers``.
        if isinstance(header, (list, tuple)):
            header_len = len(header)
        else:
            header_len = len(headers)

        header_has_number = has_number_header(header)
        first_col = [row[0] for row in body if row]
        first_col_is_numbering = is_numbering_column(first_col)

        if not header_has_number and first_col_is_numbering:
            # The body carries a numbering column the header does not name:
            # strip the leading number, but never reduce a row to nothing.
            body = [
                row[1:] if is_numeric_value(row[0]) and len(row) > 1 else row
                for row in body
                if row
            ]

        cleaned_bodies.append([row for row in body if len(row) == header_len])
    return cleaned_bodies
def read_pdf(path: str, *, page_index: int = 0, min_rows: int = 5):
    """Extract the tables of one PDF page as header/body dictionaries.

    The page is opened with pdfplumber and every table found by
    ``page.find_tables()`` is extracted. Tables with fewer than ``min_rows``
    rows are skipped. Each remaining table is split with
    ``detect_header_rows``, its header rows are merged with
    ``merge_multiline_header``, empty cells (``None`` / ``''``) are removed
    from the body rows, and the body is passed through ``cleaning_column``.

    The number of retained tables is printed to stdout.

    Args:
        path: Path of the PDF file.
        page_index: Zero-based index into ``pdf.pages``; defaults to the
            first page.
        min_rows: Minimum number of extracted rows a table needs to be kept;
            the default of 5 keeps tables with more than four rows.

    Returns:
        A list with one dict per retained table, holding ``"title"`` (the
        1-based table number as a string), ``"columns"`` and ``"rows"``.
    """
    with pdfplumber.open(path) as pdf:
        page = pdf.pages[page_index]
        # Extract while the file is still open.
        extracted = [table.extract() for table in page.find_tables()]

    tables_data = [table for table in extracted if len(table) >= min_rows]
    print(f"\nTotal tabel valid: {len(tables_data)}\n")

    parsed = []
    for number, table in enumerate(tables_data, start=1):
        header_rows, body_rows = detect_header_rows(table)
        columns = merge_multiline_header(header_rows)
        compact_body = [
            [cell for cell in row if cell not in (None, '')]
            for row in body_rows
        ]
        cleaned = cleaning_column(columns, [compact_body])
        # cleaning_column pairs headers and bodies with zip, so an empty
        # header yields an empty result; fall back to no rows instead of
        # failing on cleaned[0].
        rows = cleaned[0] if cleaned else []
        parsed.append({
            "title": str(number),
            "columns": columns,
            "rows": rows,
        })
    return parsed
def convert_df(payload):
    """Build a pandas DataFrame from a ``{"columns", "rows"[, "title"]}`` payload.

    Args:
        payload: Mapping with a ``"columns"`` list and a ``"rows"`` list of
            rows. An optional ``"title"`` is stored in ``df.attrs["title"]``.

    Returns:
        The DataFrame built from the rows, using the columns as labels.

    Raises:
        ValueError: If ``"columns"`` or ``"rows"`` is missing, or a row's
            length differs from the number of columns.
        TypeError: If ``"columns"`` or ``"rows"`` is not a list.
    """
    if not ("columns" in payload and "rows" in payload):
        raise ValueError("Payload tidak memiliki key 'columns' atau 'rows'.")

    columns = payload["columns"]
    rows = payload["rows"]
    if not isinstance(columns, list):
        raise TypeError("'columns' harus berupa list.")
    if not isinstance(rows, list):
        raise TypeError("'rows' harus berupa list.")

    expected_width = len(columns)
    for i, row in enumerate(rows):
        if len(row) != expected_width:
            raise ValueError(f"Jumlah elemen di baris ke-{i} tidak sesuai jumlah kolom.")

    df = pd.DataFrame(rows, columns=columns)
    if "title" in payload:
        df.attrs["title"] = payload["title"]
    return df
def test_read_pdf():
    """Return a hard-coded two-table sample in the ``read_pdf`` output format.

    No PDF is read; the function only returns literal data. Each entry has
    the keys ``"title"``, ``"columns"`` and ``"rows"``.
    """
    # Two-table sample. Slashes are written as plain "/"; the former "\/"
    # spelling is not a Python escape and left a literal backslash in the
    # title and column name.
    parsed = [
        {
            "title": "Luas Catchment Area (km2) Pada Wilayah Sungai di Provinsi Jawa Timur",
            "columns": ["Wilayah Sungai", "Luas (km2)", "Jumlah DAS"],
            "rows": [
                ["Bengawan Solo", "13.070,00", "94 DAS"],
                ["Brantas", "13.880,00", "20 DAS"],
                ["Welang -Rejoso", "2.601,00", "36 DAS"],
                ["Pekalen -Sampean", "3.953,00", "56 DAS"],
                ["Baru -Bajulmati", "3.675,00", "60 DAS"],
                ["Bondoyudo -Bedadung", "5.364,00", "47 DAS"],
                ["Madura", "4.575,00", "173 DAS"],
            ],
        },
        {
            "title": "Jumlah dan Kepadatan Penduduk Menurut Kabupaten/kota di Provinsi Jawa Timur Tahun 2021",
            "columns": [
                "Kabupaten/Kota",
                "Jumlah Penduduk",
                "Persentase",
                "Kepadatan Penduduk (Jiwa per Km2)",
            ],
            "rows": [
                ["Bangkalan", "1.082.759", "2,64", "1.081,20"],
                ["Banyuwangi", "1.749.773", "4,27", "302,60"],
                ["Blitar", "1.228.292", "3,00", "919,05"],
                ["Bojonegoro", "1.343.895", "3,28", "611,20"],
                ["Bondowoso", "801.541", "1,96", "525,27"],
                ["Gresik", "1.283.961", "3,13", "1.077,83"],
                ["Jember", "2.581.486", "6,30", "834,80"],
                ["Jombang", "1.350.483", "3,29", "1.211,10"],
                ["Kediri", "1.671.821", "4,08", "1.206,18"],
                ["Lamongan", "1.379.731", "3,37", "774,24"],
                ["Lumajang", "1.091.856", "2,66", "609,67"],
                ["Madiun", "754.263", "1,84", "726,94"],
                ["Magetan", "689.369", "1,68", "1.000,77"],
                ["Malang", "2.611.907", "6,37", "739,78"],
                ["Mojokerto", "1.126.540", "2,75", "1.569,37"],
                ["Nganjuk", "1.133.556", "2,77", "925,92"],
                ["Ngawi", "896.768", "2,19", "691,96"],
                ["Pacitan", "597.580", "1,46", "429,94"],
                ["Pamekasan", "840.790", "2,05", "1.061,28"],
                ["Pasuruan", "1.603.754", "3,91", "1.088,01"],
                ["Ponorogo", "968.681", "2,36", "741,89"],
                ["Probolinggo", "1.156.570", "2,82", "681,86"],
                ["Sampang", "902.514", "2,20", "731,92"],
                ["Sidoarjo", "1.951.723", "4,76", "3.076,58"],
                ["Situbondo", "666.245", "1,63", "398,98"],
                ["Sumenep", "1.134.750", "2,77", "567,79"],
                ["Trenggalek", "746.734", "1,82", "650,91"],
                ["Tuban", "1.223.257", "2,98", "666,93"],
                ["Tulungagung", "1.126.679", "2,75", "1.067,28"],
                ["Kota Batu", "215.248", "0,53", "1.574,14"],
                ["Kota Blitar", "158.123", "0,39", "4.854,87"],
                ["Kota Kediri", "292.363", "0,71", "4.611,40"],
                ["Kota Madiun", "201.243", "0,49", "6.045,15"],
                ["Kota Malang", "866.356", "2,11", "5.963,35"],
                ["Kota Mojokerto", "139.961", "0,34", "8.497,94"],
                ["Kota Pasuruan", "210.341", "0,51", "5.960,36"],
                ["Kota Probolinggo", "242.246", "0,59", "4.274,68"],
                ["Kota Surabaya", "2.970.843", "7,25", "8.475,05"],
                ["Provinsi Jawa Timur", "40.994.002", "100,00", "76.228,17"],
            ],
        },
    ]
    return parsed