update automation service

This commit is contained in:
DmsAnhr 2026-01-28 12:42:46 +07:00
parent 466820e1f6
commit 470c87f8c4
10 changed files with 558 additions and 89 deletions

3
.gitignore vendored
View File

@ -16,3 +16,6 @@ uploads/
scrapp/ scrapp/
logs/ logs/
style_temp/ style_temp/
services/styles/
cleansing_func.sql

View File

@ -1,8 +1,10 @@
import asyncio import asyncio
from uuid import uuid4 from uuid import uuid4
from fastapi import APIRouter, HTTPException from fastapi import APIRouter, HTTPException
import httpx
import requests import requests
from sqlalchemy import text from sqlalchemy import text
from sqlalchemy.exc import SQLAlchemyError
from database.connection import engine from database.connection import engine
from services.datasets.delete import delete_dataset_from_partition # import fungsi di atas from services.datasets.delete import delete_dataset_from_partition # import fungsi di atas
from response import successRes, errorRes from response import successRes, errorRes
@ -10,7 +12,7 @@ from services.datasets.publish_geonetwork import publish_metadata
from services.datasets.publish_geoserver import publish_layer_to_geoserver from services.datasets.publish_geoserver import publish_layer_to_geoserver
from services.datasets.metadata import update_job_status from services.datasets.metadata import update_job_status
from services.upload_file.upload_ws import report_progress from services.upload_file.upload_ws import report_progress
from core.config import GEOSERVER_URL, GEOSERVER_USER, GEOSERVER_PASS, QGIS_URL from core.config import GEOSERVER_URL, GEOSERVER_USER, GEOSERVER_PASS, QGIS_URL, MAIN_API_URL, SERVICE_KEY
router = APIRouter() router = APIRouter()
@ -207,3 +209,110 @@ def get_style(style_name: str, workspace: str = None):
except requests.exceptions.RequestException as e: except requests.exceptions.RequestException as e:
raise HTTPException(status_code=500, detail=f"Request error: {str(e)}") raise HTTPException(status_code=500, detail=f"Request error: {str(e)}")
# =============================================================
# cleansing query
# =============================================================
# async def query_cleansing_data_fn(
# table_name: str
# ):
# try:
# async with engine.begin() as conn:
# await conn.execute(
# text("SELECT public.fn_cleansing_satupeta_polygon(:table_name)"),
# {"table_name": table_name}
# )
# return "done"
# except SQLAlchemyError as e:
# raise RuntimeError(f"Fix geometry failed: {str(e)}")
async def query_cleansing_data(
    table_name: str
):
    """Run the PostGIS cleansing stored procedure against *table_name*.

    Executes ``CALL pr_cleansing_satupeta_polygon(:table_name, NULL)``
    inside a transaction; ``engine.begin()`` commits on success and
    rolls back on error.

    Args:
        table_name: name of the uploaded table to cleanse.

    Returns:
        str: ``"done"`` on success.

    Raises:
        RuntimeError: wrapping the underlying ``SQLAlchemyError``.
    """
    try:
        async with engine.begin() as conn:
            await conn.execute(
                text("CALL pr_cleansing_satupeta_polygon(:table_name, NULL);"),
                {"table_name": table_name}
            )
            return "done"
    except SQLAlchemyError as e:
        # Chain the original DB error so the full cause is preserved
        # in tracebacks instead of being flattened into a string.
        raise RuntimeError(f"Fix geometry failed: {str(e)}") from e
async def publish_layer(table_name: str, job_id: str):
    """Publish a cleansed table to GeoServer and GeoNetwork.

    Reports per-stage progress over the websocket channel, publishes the
    layer to GeoServer, uploads its metadata record to GeoNetwork, and
    finally marks the job FINISHED (or FAILED on any error).

    Args:
        table_name: name of the table/layer to publish.
        job_id: progress/job identifier for websocket reporting.

    Returns:
        dict: ``{"geos_link": <openlayers preview url>, "uuid": <metadata uuid>}``.

    Raises:
        RuntimeError: wrapping whatever step failed; the job status is
            set to FAILED before re-raising.
    """
    try:
        await report_progress(job_id, "cleansing", 50, "Cleansing data selesai")

        geoserver_info = publish_layer_to_geoserver(table_name, job_id)
        await report_progress(job_id, "publish_geoserver", 80, "Publish GeoServer selesai")

        metadata_uuid = publish_metadata(
            table_name=table_name,
            geoserver_links=geoserver_info
        )
        await report_progress(job_id, "done", 100, "Publish GeoNetwork selesai")

        update_job_status(table_name, "FINISHED", job_id)
        return {
            "geos_link": geoserver_info["layer_url"],
            "uuid": metadata_uuid
        }
    except Exception as e:
        # Any failure in the pipeline marks the job as failed before
        # propagating the error to the caller.
        update_job_status(table_name, "FAILED", job_id)
        raise RuntimeError(f"Publish layer gagal: {e}") from e
async def upload_to_main(payload: dict):
    """POST a mapset payload to the main API's internal endpoint.

    Authenticates with the shared service key header and returns the
    decoded JSON response wrapped in a success envelope.

    Args:
        payload: JSON-serializable mapset body.

    Returns:
        dict: ``{"status": "success", "data": <response json>}``.

    Raises:
        HTTPException: 500 on connection/DNS/timeout errors, the
            upstream status code on 4xx/5xx replies, 500 on any other
            unexpected failure.
    """
    try:
        async with httpx.AsyncClient(timeout=10) as client:
            response = await client.post(
                MAIN_API_URL + "/api/internal/mapsets",
                json=payload,
                headers={
                    "X-SERVICE-KEY": SERVICE_KEY
                }
            )
            response.raise_for_status()
            return {
                "status": "success",
                "data": response.json()
            }
    except httpx.RequestError as e:
        # Connection-level failure: DNS, refused connection, timeout, ...
        raise HTTPException(
            status_code=500,
            detail=f"Gagal connect ke MAIN_API: {str(e)}"
        ) from e
    except httpx.HTTPStatusError as e:
        # The upstream API answered, but with a 4xx/5xx status.
        raise HTTPException(
            status_code=e.response.status_code,
            detail=f"MAIN_API error: {e.response.text}"
        ) from e
    except Exception as e:
        # Fallback for anything unexpected (e.g. a non-JSON body).
        raise HTTPException(
            status_code=500,
            detail=f"Unexpected error: {str(e)}"
        ) from e

View File

@ -6,6 +6,9 @@ load_dotenv()
API_VERSION = "2.1.3" API_VERSION = "2.1.3"
MAIN_API_URL = os.getenv("MAIN_API_URL")
SERVICE_KEY = os.getenv("SERVICE_KEY")
POSTGIS_URL = os.getenv("POSTGIS_URL") POSTGIS_URL = os.getenv("POSTGIS_URL")
POSTGIS_SYNC_URL = os.getenv("SYNC_URL") POSTGIS_SYNC_URL = os.getenv("SYNC_URL")
@ -26,6 +29,7 @@ UPLOAD_FOLDER = Path(os.getenv("UPLOAD_FOLDER", "./uploads"))
MAX_FILE_MB = int(os.getenv("MAX_FILE_MB", 30)) MAX_FILE_MB = int(os.getenv("MAX_FILE_MB", 30))
ALLOWED_ORIGINS = [ ALLOWED_ORIGINS = [
"http://localhost:4000",
"http://localhost:3000", "http://localhost:3000",
"http://127.0.0.1:3000", "http://127.0.0.1:3000",
"http://localhost:5173", "http://localhost:5173",

View File

@ -24,9 +24,9 @@ async def loginService(username: str, password: str, db: AsyncSession):
if not pwd_context.verify(password, user.password_hash): if not pwd_context.verify(password, user.password_hash):
raise errorRes(status_code=401, message="Invalid username or password") raise errorRes(status_code=401, message="Invalid username or password")
# Validation for institution user # Validation for organization user
if user.role != "admin" and not user.institution_id: if user.role != "admin" and not user.organization_id:
raise errorRes(status_code=403, message="User must belong to an institution") raise errorRes(status_code=403, message="User must belong to an organization")
# Generate single active token # Generate single active token
token = str(uuid4()) token = str(uuid4())
@ -41,7 +41,7 @@ async def loginService(username: str, password: str, db: AsyncSession):
"status": "success", "status": "success",
"username": user.username, "username": user.username,
"role": user.role, "role": user.role,
"institution_id": user.institution_id, "organization_id": user.organization_id,
"token": token, "token": token,
"token_expired_at": expiry.isoformat() "token_expired_at": expiry.isoformat()
} }

View File

@ -4,7 +4,6 @@ from database.connection import sync_engine
from utils.logger_config import log_activity from utils.logger_config import log_activity
def update_job_status(table_name: str, status: str, job_id: str = None): def update_job_status(table_name: str, status: str, job_id: str = None):
print("update status")
query = text(""" query = text("""
UPDATE backend.author_metadata UPDATE backend.author_metadata
SET process = :status, SET process = :status,

View File

@ -7,6 +7,22 @@ from datetime import datetime
from uuid import uuid4 from uuid import uuid4
import re import re
def create_gn_session():
    """Open an authenticated GeoNetwork session and fetch its CSRF token.

    GeoNetwork requires an ``X-XSRF-TOKEN`` header on mutating requests;
    the token is issued as a cookie on any authenticated request, so a
    lightweight GET to ``info?type=me`` is made first to obtain it.

    Returns:
        tuple: ``(requests.Session, str)`` — the session (cookies set)
        and the XSRF token value.

    Raises:
        requests.HTTPError: if the priming request fails with an error status.
        RuntimeError: if no ``XSRF-TOKEN`` cookie was issued.
    """
    session = requests.Session()
    session.auth = (GEONETWORK_USER, GEONETWORK_PASS)
    resp = session.get(f"{GEONETWORK_URL}/srv/eng/info?type=me")
    # Fail fast on auth/connectivity problems instead of surfacing a
    # confusing "token missing" error further down.
    resp.raise_for_status()
    xsrf_token = session.cookies.get("XSRF-TOKEN")
    if not xsrf_token:
        raise RuntimeError("XSRF token missing")
    return session, xsrf_token
def escape_url_params(url: str) -> str: def escape_url_params(url: str) -> str:
""" """
Escape karakter berbahaya di dalam URL agar valid dalam XML. Escape karakter berbahaya di dalam URL agar valid dalam XML.
@ -73,30 +89,6 @@ def get_extent(table_name: str):
"ymax": -5.4819 # north "ymax": -5.4819 # north
} }
# def get_author_metadata(table_name: str):
# sql = """
# SELECT table_title, dataset_title, dataset_abstract, keywords, date_created,
# organization_name, contact_person_name,
# contact_email, contact_phone, geom_type
# FROM backend.author_metadata
# WHERE table_title = :table
# LIMIT 1
# """
# conn = engine.connect()
# try:
# row = conn.execute(text(sql), {"table": table_name}).fetchone()
# finally:
# conn.close()
# if not row:
# raise Exception(f"Tidak ada metadata untuk tabel: {table_name}")
# # FIX: SQLAlchemy Row → dict
# return dict(row._mapping)
def get_author_metadata(table_name: str): def get_author_metadata(table_name: str):
sql = """ sql = """
@ -356,10 +348,10 @@ def generate_metadata_xml(table_name, meta, extent, geoserver_links):
<gmd:pointOfContact> <gmd:pointOfContact>
<gmd:CI_ResponsibleParty> <gmd:CI_ResponsibleParty>
<gmd:individualName> <gmd:individualName>
<gco:CharacterString>Dinas Tenaga Kerja dan Transmigrasi Provinsi Jawa Timur</gco:CharacterString> <gco:CharacterString>Lab AI Polinema</gco:CharacterString>
</gmd:individualName> </gmd:individualName>
<gmd:organisationName> <gmd:organisationName>
<gco:CharacterString>Dinas Tenaga Kerja dan Transmigrasi Provinsi Jawa Timur</gco:CharacterString> <gco:CharacterString>Lab AI Polinema</gco:CharacterString>
</gmd:organisationName> </gmd:organisationName>
<gmd:positionName gco:nilReason="missing"/> <gmd:positionName gco:nilReason="missing"/>
<gmd:contactInfo> <gmd:contactInfo>
@ -510,9 +502,7 @@ def generate_metadata_xml(table_name, meta, extent, geoserver_links):
<gmd:onLine> <gmd:onLine>
<gmd:CI_OnlineResource> <gmd:CI_OnlineResource>
<gmd:linkage> <gmd:linkage>
<gmd:URL> <gmd:URL>{geoserver_links["wms_url"]}</gmd:URL>
https://geoserver.jatimprov.go.id/wms?service=WMS&amp;version=1.1.0&amp;request=GetMap&amp;layers=satupeta:fngsl_trw_1_4_2020_2023&amp;bbox=110.89527893066406,-8.78022289276123,116.27019500732422,-5.042964935302734&amp;width=768&amp;height=534&amp;srs=EPSG:4326&amp;styles=&amp;format=application/openlayers
</gmd:URL>
</gmd:linkage> </gmd:linkage>
<gmd:protocol> <gmd:protocol>
<gco:CharacterString>WWW:LINK-1.0-http--link</gco:CharacterString> <gco:CharacterString>WWW:LINK-1.0-http--link</gco:CharacterString>
@ -578,21 +568,23 @@ def generate_metadata_xml(table_name, meta, extent, geoserver_links):
""" """
# Geonetwork version 4.4.9.0
def upload_metadata_to_geonetwork(xml_metadata: str): def upload_metadata_to_geonetwork(xml_metadata: str):
session = requests.Session() # session = requests.Session()
session.auth = (GEONETWORK_USER, GEONETWORK_PASS) # session.auth = (GEONETWORK_USER, GEONETWORK_PASS)
# 1. Get XSRF token # # 1. Get XSRF token
try: # try:
info_url = f"{GEONETWORK_URL}/srv/eng/info?type=me" # info_url = f"{GEONETWORK_URL}/srv/eng/info?type=me"
session.get(info_url) # session.get(info_url)
except requests.exceptions.RequestException as e: # except requests.exceptions.RequestException as e:
raise HTTPException(status_code=503, detail=f"Failed to connect to GeoNetwork: {e}") # raise HTTPException(status_code=503, detail=f"Failed to connect to GeoNetwork: {e}")
xsrf_token = session.cookies.get('XSRF-TOKEN') # xsrf_token = session.cookies.get('XSRF-TOKEN')
if not xsrf_token: # if not xsrf_token:
raise HTTPException(status_code=500, detail="Could not retrieve XSRF-TOKEN from GeoNetwork.") # raise HTTPException(status_code=500, detail="Could not retrieve XSRF-TOKEN from GeoNetwork.")
session, xsrf_token = create_gn_session()
headers = { headers = {
'X-XSRF-TOKEN': xsrf_token, 'X-XSRF-TOKEN': xsrf_token,
'Accept': 'application/json' 'Accept': 'application/json'
@ -605,15 +597,32 @@ def upload_metadata_to_geonetwork(xml_metadata: str):
'file': ('metadata.xml', xml_metadata, 'application/xml') 'file': ('metadata.xml', xml_metadata, 'application/xml')
} }
params = {
"ownerGroup": 1, # all
"ownerUser": 1 # admin
}
response = session.post( response = session.post(
GN_API_RECORDS_URL, GN_API_RECORDS_URL,
params=params,
files=files, files=files,
headers=headers, headers=headers,
cookies=session.cookies.get_dict() cookies=session.cookies.get_dict()
) )
metadata_infos = response.json().get("metadataInfos", {})
uuid = None
for records in metadata_infos.values():
if records and isinstance(records, list):
uuid = records[0].get("uuid")
break
if not uuid:
raise ValueError("UUID not found in GeoNetwork response")
publish_record(session, uuid)
# print("response", response.json()) # print("response", response.json())
return response.json() return uuid
@ -629,11 +638,59 @@ def publish_metadata(table_name: str, geoserver_links: dict):
) )
xml_clean = fix_xml_urls(xml) xml_clean = fix_xml_urls(xml)
response = upload_metadata_to_geonetwork(xml_clean) uuid = upload_metadata_to_geonetwork(xml_clean)
uuid = response.get("uuid")
print(f"[GeoNetwork] Metadata uploaded. UUID = {uuid}") print(f"[GeoNetwork] Metadata uploaded. UUID = {uuid}")
return uuid return uuid
def publish_record(session, uuid):
    """Make a freshly-uploaded GeoNetwork record publicly viewable.

    Clears existing privileges on the record and grants ``view`` to
    group 1 via the record's ``/sharing`` endpoint.

    Args:
        session: authenticated ``requests.Session`` with the XSRF cookie set.
        uuid: UUID of the metadata record to publish.

    Raises:
        requests.HTTPError: if GeoNetwork rejects the sharing update.
    """
    print('[uuid]', uuid)
    token = session.cookies.get('XSRF-TOKEN')
    sharing_url = f"{GEONETWORK_URL}/srv/api/records/{uuid}/sharing"
    body = {
        "clear": True,
        "privileges": [
            {
                "group": 1,
                "operations": {
                    "view": True
                }
            }
        ]
    }
    resp = session.put(
        sharing_url,
        json=body,
        headers={
            "X-XSRF-TOKEN": token,
            "Accept": "application/json",
            "Content-Type": "application/json"
        }
    )
    resp.raise_for_status()
# single stand func
# def publish_record(uuid):
# session, xsrf_token = create_gn_session()
# headers = {
# "X-XSRF-TOKEN": xsrf_token,
# "Content-Type": "application/json"
# }
# url = f"{GEONETWORK_URL}/srv/api/records/{uuid}/sharing"
# payload = {
# "clear": True,
# "privileges": [
# {"group": 1, "operations": {"view": True}}
# ]
# }
# resp = session.put(url, json=payload, headers=headers)
# resp.raise_for_status()

View File

@ -52,18 +52,52 @@ def publish_layer_to_geoserver(table: str, job_id: str):
print(f"[WARNING] SLD file tidak ditemukan: {sld_file}") print(f"[WARNING] SLD file tidak ditemukan: {sld_file}")
else: else:
print(f"[GeoServer] Upload SLD {sld_file}") print(f"[GeoServer] Upload SLD {sld_file}")
style_url = f"{GEOSERVER_URL}/rest/styles"
with open(sld_file, "rb") as sld: #old
requests.post( # style_url = f"{GEOSERVER_URL}/rest/styles"
f"{style_url}?name={style_name}&workspace={GEOSERVER_WORKSPACE}",
# with open(sld_file, "rb") as sld:
# requests.post(
# f"{style_url}?name={style_name}&workspace={GEOSERVER_WORKSPACE}",
# auth=(GEOSERVER_USER, GEOSERVER_PASS),
# headers={"Content-Type": "application/vnd.ogc.sld+xml"},
# data=sld.read()
# )
# print(f"[GeoServer] SLD uploaded: {style_name}")
#new
style_url = (
f"{GEOSERVER_URL}/rest/workspaces/"
f"{GEOSERVER_WORKSPACE}/styles"
)
with open(sld_file, "r", encoding="utf-8") as f:
sld_content = f.read()
# 🔥 INI BARIS PENTINGNYA
sld_content = sld_content.lstrip("\ufeff \t\r\n")
resp = requests.post(
f"{style_url}?name={style_name}",
auth=(GEOSERVER_USER, GEOSERVER_PASS), auth=(GEOSERVER_USER, GEOSERVER_PASS),
headers={"Content-Type": "application/vnd.ogc.sld+xml"}, headers={"Content-Type": "application/vnd.ogc.sld+xml"},
data=sld.read() data=sld_content.encode("utf-8")
)
if resp.status_code not in (200, 201):
raise Exception(
f"Upload SLD gagal ({resp.status_code}): {resp.text}"
) )
print(f"[GeoServer] SLD uploaded: {style_name}") print(f"[GeoServer] SLD uploaded: {style_name}")
# ========================================== # ==========================================
# 3. Apply SLD to the layer # 3. Apply SLD to the layer
# ========================================== # ==========================================
@ -116,14 +150,29 @@ def publish_layer_to_geoserver(table: str, job_id: str):
f"{GEOSERVER_URL}/{GEOSERVER_WORKSPACE}/wfs?" f"{GEOSERVER_URL}/{GEOSERVER_WORKSPACE}/wfs?"
f"service=WFS&request=GetFeature&typeName={GEOSERVER_WORKSPACE}:{table}" f"service=WFS&request=GetFeature&typeName={GEOSERVER_WORKSPACE}:{table}"
) )
print(f"[GeoServer] WMS URL: {wms_link}") # print(f"[GeoServer] WMS URL: {wms_link}")
print(f"[GeoServer] WFS URL: {wfs_link}") # print(f"[GeoServer] WFS URL: {wfs_link}")
print(f"[GeoServer] Reload completed. Layer {table} ready.") # print(f"[GeoServer] Reload completed. Layer {table} ready.")
openlayer_url = (
f"{GEOSERVER_URL}/{GEOSERVER_WORKSPACE}/wms?"
f"service=WMS"
f"&version=1.1.0"
f"&request=GetMap"
f"&layers={GEOSERVER_WORKSPACE}:{table}"
f"&styles="
f"&bbox=110.89528623700005%2C-8.780412043999945%2C116.26994997700001%2C-5.042971664999925"
f"&width=768"
f"&height=384"
f"&srs=EPSG:4326"
f"&format=application/openlayers"
)
return { return {
"table": table, "table": table,
"style": style_name, "style": style_name,
"wms_url": wms_link, "wms_url": wms_link,
"wfs_url": wfs_link "wfs_url": wfs_link,
"layer_url": openlayer_url
} }

View File

@ -1,7 +1,7 @@
import re import re
import pdfplumber import pdfplumber
import pandas as pd import pandas as pd
from services.upload_file.utils.pdf_cleaner import row_ratio, has_mixed_text_and_numbers, is_short_text_row, parse_page_selection, filter_geo_admin_column, cleaning_column from services.upload_file.utils.pdf_cleaner import get_number_column_index, get_start_end_number, normalize_number_column, row_ratio, has_mixed_text_and_numbers, is_short_text_row, parse_page_selection, filter_geo_admin_column, cleaning_column
from services.upload_file.upload_exceptions import PDFReadError from services.upload_file.upload_exceptions import PDFReadError
from utils.logger_config import setup_logger from utils.logger_config import setup_logger
@ -56,6 +56,41 @@ def merge_multiline_header(header_rows):
final_header = [v for v in final_header if v not in ['', None]] final_header = [v for v in final_header if v not in ['', None]]
return final_header return final_header
def merge_parsed_table(tables):
    """Re-attach table fragments that were split across PDF pages.

    A table whose number column starts at 1 — or that has no number
    column at all — is treated as a "root"; every other table is a
    fragment. Each fragment is appended to the first root that has
    identical columns and whose last row number immediately precedes
    the fragment's first row number.

    Args:
        tables: list of dicts with ``columns`` and ``rows`` keys.

    Returns:
        list: the root tables, with matching fragments merged in place.
    """
    roots, fragments = [], []

    # Step 1: classify every table as root or fragment.
    for tbl in tables:
        idx = get_number_column_index(tbl["columns"])
        if idx is None:
            roots.append(tbl)
            continue
        first_no, _ = get_start_end_number(tbl["rows"], idx)
        (roots if first_no == 1 else fragments).append(tbl)

    # Step 2: attach each fragment to at most one root.
    for frag in fragments:
        frag_first, _ = get_start_end_number(
            frag["rows"], get_number_column_index(frag["columns"])
        )
        for root in roots:
            if root["columns"] != frag["columns"]:
                continue
            _, root_last = get_start_end_number(
                root["rows"], get_number_column_index(root["columns"])
            )
            if frag_first == root_last + 1:
                root["rows"].extend(frag["rows"])
                break  # a fragment may only attach to one root

    return roots
def read_pdf(path: str, page: str): def read_pdf(path: str, page: str):
""" """
@ -90,6 +125,74 @@ def read_pdf(path: str, page: str):
Raises: Raises:
PDFReadError: Jika terjadi kesalahan saat membaca atau parsing PDF. PDFReadError: Jika terjadi kesalahan saat membaca atau parsing PDF.
""" """
# try:
# pdf_path = path
# selectedPage = page if page else "1"
# tables_data = []
# with pdfplumber.open(pdf_path) as pdf:
# total_pages = len(pdf.pages)
# selected_pages = parse_page_selection(selectedPage, total_pages)
# logger.info(f"[INFO] Total Halaman PDF: {total_pages}")
# logger.info(f"[INFO] Total Halaman yang dipilih: {len(selected_pages)}")
# logger.info(f"[INFO] Halaman yang dipilih untuk dibaca: {selected_pages}")
# for page_num in selected_pages:
# pdf_page = pdf.pages[page_num - 1]
# tables = pdf_page.find_tables()
# logger.info(f"\n\n[INFO] Halaman {page_num}: {len(tables)} tabel terdeteksi")
# # pembacaan title ini tidak valid untuk halaman lanscape
# # for line in pdf_page.extract_text_lines():
# # if line['top'] > tables[0].bbox[1]:
# # break
# # previous_line = line
# # print('[TITLE]', previous_line['text'])
# for i, t in enumerate(tables, start=1):
# table = t.extract()
# if len(table) > 2:
# print(f"[TBL] tabel : {i} - halaman {page_num}")
# tables_data.append(table)
# logger.info(f"\nTotal tabel terbaca: {len(tables_data)}\n")
# header_only, body_only = [], []
# for tbl in tables_data:
# head, body = detect_header_rows(tbl)
# header_only.append(head)
# body_only.append(body)
# clean_header = [merge_multiline_header(h) for h in header_only]
# clean_body = []
# for i, raw_body in enumerate(body_only):
# con_body = [[cell for cell in row if cell not in (None, '')] for row in raw_body]
# cleaned = cleaning_column(clean_header[i], [con_body])
# clean_body.append(cleaned[0])
# parsed = []
# for i, (cols, rows) in enumerate(zip(clean_header, clean_body), start=1):
# parsed.append({
# "title": str(i),
# "columns": cols,
# "rows": rows
# })
# # =================================================================
# clean_parsed = filter_geo_admin_column(parsed)
# merge_parsed = merge_parsed_table(clean_parsed)
# logger.info(f"\nTotal tabel valid: {len(merge_parsed)}\n")
# ordered_tables = [normalize_number_column(t) for t in merge_parsed]
# return ordered_tables
# except Exception as e:
# raise PDFReadError(f"Gagal membaca PDF: {e}", code=422)
try: try:
pdf_path = path pdf_path = path
selectedPage = page if page else "1" selectedPage = page if page else "1"
@ -99,26 +202,36 @@ def read_pdf(path: str, page: str):
total_pages = len(pdf.pages) total_pages = len(pdf.pages)
selected_pages = parse_page_selection(selectedPage, total_pages) selected_pages = parse_page_selection(selectedPage, total_pages)
logger.info(f"[INFO] Total halaman PDF: {total_pages}") logger.info(f"[INFO] Total Halaman PDF: {total_pages}")
logger.info(f"[INFO] Total Halaman yang dipilih: {len(selected_pages)}")
logger.info(f"[INFO] Halaman yang dipilih untuk dibaca: {selected_pages}") logger.info(f"[INFO] Halaman yang dipilih untuk dibaca: {selected_pages}")
for page_num in selected_pages: for page_num in selected_pages:
pdf_page = pdf.pages[page_num - 1] pdf_page = pdf.pages[page_num - 1]
tables = pdf_page.find_tables() tables = pdf_page.find_tables()
logger.info(f"[INFO] Halaman {page_num}: {len(tables)} tabel terdeteksi") logger.info(f"\n\n[INFO] Halaman {page_num}: {len(tables)} tabel terdeteksi")
for t in tables: # pembacaan title ini tidak valid untuk halaman lanscape
# for line in pdf_page.extract_text_lines():
# if line['top'] > tables[0].bbox[1]:
# break
# previous_line = line
# print('[TITLE]', previous_line['text'])
for i, t in enumerate(tables, start=1):
table = t.extract() table = t.extract()
if len(table) > 2: if len(table) > 2:
tables_data.append(table) print(f"[TBL] tabel : {i} - halaman {page_num}")
tables_data.append({"page": f"halaman {page_num} - {i}", "table": table})
logger.info(f"\nTotal tabel valid: {len(tables_data)}\n") logger.info(f"\nTotal tabel terbaca: {len(tables_data)}\n")
header_only, body_only = [], [] header_only, body_only, page_info = [], [], []
for tbl in tables_data: for tbl in tables_data:
head, body = detect_header_rows(tbl) head, body = detect_header_rows(tbl["table"])
header_only.append(head) header_only.append(head)
body_only.append(body) body_only.append(body)
page_info.append(tbl["page"])
clean_header = [merge_multiline_header(h) for h in header_only] clean_header = [merge_multiline_header(h) for h in header_only]
clean_body = [] clean_body = []
@ -129,15 +242,22 @@ def read_pdf(path: str, page: str):
clean_body.append(cleaned[0]) clean_body.append(cleaned[0])
parsed = [] parsed = []
for i, (cols, rows) in enumerate(zip(clean_header, clean_body), start=1): for i, (cols, rows, page) in enumerate(zip(clean_header, clean_body, page_info), start=1):
parsed.append({ parsed.append({
"title": str(i), "title": page,
"columns": cols, "columns": cols,
"rows": rows "rows": rows
}) })
# =================================================================
clean_parsed = filter_geo_admin_column(parsed) clean_parsed = filter_geo_admin_column(parsed)
return clean_parsed merge_parsed = merge_parsed_table(clean_parsed)
logger.info(f"\nTotal tabel valid: {len(merge_parsed)}\n")
ordered_tables = [normalize_number_column(t) for t in merge_parsed]
return ordered_tables
except Exception as e: except Exception as e:
raise PDFReadError(f"Gagal membaca PDF: {e}", code=422) raise PDFReadError(f"Gagal membaca PDF: {e}", code=422)

View File

@ -10,17 +10,16 @@ import asyncio
from pyproj import CRS from pyproj import CRS
from shapely.geometry.base import BaseGeometry from shapely.geometry.base import BaseGeometry
from shapely.geometry import base as shapely_base from shapely.geometry import base as shapely_base
from fastapi import File, Form, UploadFile, HTTPException from fastapi import Depends, File, Form, UploadFile, HTTPException
from api.routers.datasets_router import cleansing_data from api.routers.datasets_router import cleansing_data, publish_layer, query_cleansing_data, upload_to_main
from core.config import UPLOAD_FOLDER, MAX_FILE_MB, VALID_WKT_PREFIXES from core.config import UPLOAD_FOLDER, MAX_FILE_MB, VALID_WKT_PREFIXES, GEONETWORK_URL
from services.upload_file.ai_generate import send_metadata from services.upload_file.ai_generate import send_metadata
from services.upload_file.readers.reader_csv import read_csv from services.upload_file.readers.reader_csv import read_csv
from services.upload_file.readers.reader_shp import read_shp from services.upload_file.readers.reader_shp import read_shp
from services.upload_file.readers.reader_gdb import read_gdb from services.upload_file.readers.reader_gdb import read_gdb
from services.upload_file.readers.reader_mpk import read_mpk from services.upload_file.readers.reader_mpk import read_mpk
from services.upload_file.readers.reader_pdf import convert_df, read_pdf from services.upload_file.readers.reader_pdf import convert_df, read_pdf
from services.upload_file.utils.geometry_detector import detect_and_build_geometry from services.upload_file.utils.geometry_detector import detect_and_build_geometry, attach_polygon_geometry_auto
from services.upload_file.utils.geometry_detector import attach_polygon_geometry_auto
from services.upload_file.upload_ws import report_progress from services.upload_file.upload_ws import report_progress
from database.connection import engine, sync_engine from database.connection import engine, sync_engine
from database.models import Base from database.models import Base
@ -309,7 +308,7 @@ def process_data(df: pd.DataFrame, ext: str, filename: str, fileDesc: str):
} }
ai_suggest = send_metadata(ai_context) ai_suggest = send_metadata(ai_context)
# ai_suggest = {'judul': 'Peta Risiko Letusan Gunung Arjuna di Provinsi Jawa Timur', 'abstrak': 'Peta ini menggambarkan wilayah berisiko letusan Gunung Arjuna yang berada di Provinsi Jawa Timur. Data disajikan dalam bentuk poligon yang menunjukkan zona risiko berdasarkan analisis potensi aktivitas vulkanik.', 'tujuan': 'Data dapat digunakan untuk perencanaan mitigasi bencana dan pengambilan keputusan di wilayah Jawa Timur.', 'keyword': ['Risiko letusan', 'Gunung Arjuna', 'Bencana alam', 'Provinsi Jawa Timur', 'Geologi'], 'kategori': ['Geoscientific information', 'Environment'], 'kategori_mapset': 'Lingkungan Hidup'} # ai_suggest = {'judul': 'Peta Risiko Letusan Gunung Arjuna di Provinsi Jawa Timur', 'abstrak': 'Peta ini menggambarkan wilayah berisiko letusan Gunung Arjuna yang berada di Provinsi Jawa Timur. Data disajikan dalam bentuk poligon yang menunjukkan zona risiko berdasarkan analisis potensi aktivitas vulkanik.', 'tujuan': 'Data dapat digunakan untuk perencanaan mitigasi bencana dan pengambilan keputusan di wilayah Jawa Timur.', 'keyword': ['Risiko letusan', 'Gunung Arjuna', 'Bencana alam', 'Provinsi Jawa Timur', 'Geologi'], 'kategori': ['Geoscientific information', 'Environment'], 'kategori_mapset': 'Lingkungan Hidup'}
print(ai_suggest) # print(ai_suggest)
response = { response = {
"message": "File berhasil dibaca dan dianalisis.", "message": "File berhasil dibaca dan dianalisis.",
@ -363,11 +362,11 @@ async def handle_upload_file(file: UploadFile = File(...), page: Optional[str] =
tbl = read_pdf(tmp_path, page) tbl = read_pdf(tmp_path, page)
if len(tbl) == 0: if len(tbl) == 0:
res = { res = {
"message": "Tidak ditemukan tabel valid", "message": "Tidak ditemukan tabel valid pada halaman yang dipilih",
"tables": {}, "tables": {},
"file_type": ext "file_type": ext
} }
return successRes(message="Tidak ditemukan tabel valid", data=res) return successRes(message="Tidak ditemukan tabel valid pada halaman yang dipilih", data=res)
elif len(tbl) > 1: elif len(tbl) > 1:
res = { res = {
"message": "File berhasil dibaca dan dianalisis.", "message": "File berhasil dibaca dan dianalisis.",
@ -389,10 +388,7 @@ async def handle_upload_file(file: UploadFile = File(...), page: Optional[str] =
df = read_gdb(str(tmp_path)) df = read_gdb(str(tmp_path))
else: else:
raise errorRes( return successRes(message="ZIP file tidak mengandung SHP / GDB valid.")
status_code=400,
message="ZIP file tidak mengandung SHP atau GDB yang valid."
)
else: else:
raise errorRes(status_code=400, message="Unsupported file type") raise errorRes(status_code=400, message="Unsupported file type")
@ -680,17 +676,53 @@ async def handle_to_postgis(payload: UploadRequest, user_id: int = 2):
job_id = generate_job_id(str(user_id)) job_id = generate_job_id(str(user_id))
result = { result = {
"job_id": job_id, "job_id": job_id,
"job_status": "wait",
"table_name": table_name, "table_name": table_name,
"status": "success", "status": "success",
"message": f"Tabel '{table_name}' berhasil dibuat.", "message": f"Tabel '{table_name}' berhasil dibuat.",
"total_rows": len(gdf), "total_rows": len(gdf),
"geometry_type": unified_geom_type, "geometry_type": unified_geom_type,
"crs": detected_crs, "crs": detected_crs,
"metadata_uuid": ""
} }
save_xml_to_sld(payload.style, job_id) save_xml_to_sld(payload.style, job_id)
await report_progress(job_id, "upload", 20, "Upload selesai") await report_progress(job_id, "upload", 20, "Upload selesai")
cleansing_data(table_name, job_id) # cleansing_data(table_name, job_id)
cleansing = await query_cleansing_data(table_name)
result['job_status'] = cleansing
publish = await publish_layer(table_name, job_id)
result['metadata_uuid'] = publish['uuid']
mapset = {
"name": payload.title,
"description": author.get("abstract"),
"scale": "1:25000",
"projection_system_id": "0196c746-d1ba-7f1c-9706-5df738679cc7",
"category_id": author.get("mapsetCategory"),
"data_status": "sementara",
"classification_id": "01968b4b-d3f9-76c9-888c-ee887ac31ce4",
"producer_id": "019bd4ea-eb33-704e-83c3-8253d457b187",
"layer_type": unified_geom_type[0],
"source_id": ["019bd4e7-3df8-75c8-9b89-3f310967649c"],
"layer_url": publish['geos_link'],
"metadata_url": f"{GEONETWORK_URL}/srv/eng/catalog.search#/metadata/{publish['uuid']}",
"coverage_level": "provinsi",
"coverage_area": "kabupaten",
"data_update_period": "Tahunan",
"data_version": "2026",
"is_popular": False,
"is_active": True,
"regional_id": "01968b53-a910-7a67-bd10-975b8923b92e",
"notes": "Mapset baru dibuat",
"status_validation": "on_verification",
}
print("mapset data",mapset)
await upload_to_main(mapset)
return successRes(data=result) return successRes(data=result)
except Exception as e: except Exception as e:
@ -705,6 +737,53 @@ async def handle_to_postgis(payload: UploadRequest, user_id: int = 2):
# async def handle_to_postgis(payload: UploadRequest, user_id: int = 2):
# try:
# job_id = generate_job_id(str(user_id))
# result = {
# "job_id": job_id,
# "job_status": "done",
# "table_name": "just for test",
# "status": "success",
# "message": f"Tabel test berhasil dibuat.",
# "total_rows": 10,
# "geometry_type": "Polygon",
# "crs": "EPSG 4326",
# "metadata_uuid": "-"
# }
# mapset = {
# "name": "Resiko Letusan Gunung Arjuno",
# "description": "Testing Automation Upload",
# "scale": "1:25000",
# "projection_system_id": "0196c746-d1ba-7f1c-9706-5df738679cc7",
# "category_id": "0196c80c-855f-77f9-abd0-0c8a30b8c2f5",
# "data_status": "sementara",
# "classification_id": "01968b4b-d3f9-76c9-888c-ee887ac31ce4",
# "producer_id": "019bd4ea-eb33-704e-83c3-8253d457b187",
# "layer_type": "polygon",
# "source_id": ["019bd4e7-3df8-75c8-9b89-3f310967649c"],
# "layer_url": "http://192.168.60.24:8888/geoserver/wms?service=WMS&version=1.1.0&request=GetMap&layers=labai:risiko_letusan_gunung_arjuno_bromo&bbox=110.89528623700005,-8.780412043999945,116.26994997700001,-5.042971664999925&width=768&height=534&srs=EPSG:4326&styles=&format=application/openlayers",
# "metadata_url": "http://192.168.60.24:7777/geonetwork/srv/eng/catalog.search#/metadata/9e5e2f09-13ef-49b5-bb49-1cb12136f63b",
# "coverage_level": "provinsi",
# "coverage_area": "kabupaten",
# "data_update_period": "Tahunan",
# "data_version": "2026",
# "is_popular": False,
# "is_active": True,
# "regional_id": "01968b53-a910-7a67-bd10-975b8923b92e",
# "notes": "Mapset baru dibuat",
# "status_validation": "on_verification",
# }
# await upload_to_main(mapset)
# return successRes(data=result)
# except Exception as e:
# print("errot", e)

View File

@ -157,3 +157,52 @@ def is_short_text_row(row):
text_only = all(not is_number(c) for c in non_empty) text_only = all(not is_number(c) for c in non_empty)
joined = " ".join(non_empty) joined = " ".join(non_empty)
return text_only and len(non_empty) <= 2 and len(joined) < 20 return text_only and len(non_empty) <= 2 and len(joined) < 20
def get_number_column_index(columns):
    """Return the index of the first header recognised as a row-number
    column, or ``None`` when no such column exists."""
    return next(
        (pos for pos, header in enumerate(columns) if has_number_header(header)),
        None,
    )
def get_start_end_number(rows, idx):
    """Return the integer number-column values of the first and last row.

    Args:
        rows: table body — a list of row lists.
        idx: index of the number column within each row.

    Returns:
        tuple: ``(start, end)`` as ints, or ``(None, None)`` when the
        table is empty, the index is invalid, or either cell does not
        parse as an integer.
    """
    try:
        return int(rows[0][idx]), int(rows[-1][idx])
    except (IndexError, TypeError, ValueError):
        # Narrowed from a bare ``except`` so unrelated bugs are not
        # silently swallowed; these three cover empty rows, a None/bad
        # index, and non-numeric cell content.
        return None, None
def normalize_number_column(table):
    """Rewrite *table*'s row-number column so numbering is strictly increasing.

    PDF extraction sometimes repeats or resets row numbers on merged
    fragments; this walks the rows and bumps any value that does not
    exceed the previous one, leaving non-numeric cells untouched.

    Args:
        table: dict with ``columns`` and ``rows`` keys; mutated in place.

    Returns:
        dict: the same table object, for chaining.
    """
    columns = table["columns"]
    rows = table["rows"]

    num_idx = get_number_column_index(columns)
    if num_idx is None:
        # No number column to normalize.
        return table

    current = None
    for row in rows:
        try:
            val = int(row[num_idx])
        except (IndexError, TypeError, ValueError):
            # Narrowed from a bare ``except``: skip only cells that are
            # missing or not parseable as integers.
            continue
        if current is None:
            current = val
        elif val <= current:
            # Duplicate or reset numbering: continue the sequence.
            current += 1
        else:
            current = val
        row[num_idx] = str(current)
    return table