update automation service
This commit is contained in:
parent
466820e1f6
commit
470c87f8c4
3
.gitignore
vendored
3
.gitignore
vendored
|
|
@ -16,3 +16,6 @@ uploads/
|
|||
scrapp/
|
||||
logs/
|
||||
style_temp/
|
||||
services/styles/
|
||||
|
||||
cleansing_func.sql
|
||||
|
|
@ -1,8 +1,10 @@
|
|||
import asyncio
|
||||
from uuid import uuid4
|
||||
from fastapi import APIRouter, HTTPException
|
||||
import httpx
|
||||
import requests
|
||||
from sqlalchemy import text
|
||||
from sqlalchemy.exc import SQLAlchemyError
|
||||
from database.connection import engine
|
||||
from services.datasets.delete import delete_dataset_from_partition # import fungsi di atas
|
||||
from response import successRes, errorRes
|
||||
|
|
@ -10,7 +12,7 @@ from services.datasets.publish_geonetwork import publish_metadata
|
|||
from services.datasets.publish_geoserver import publish_layer_to_geoserver
|
||||
from services.datasets.metadata import update_job_status
|
||||
from services.upload_file.upload_ws import report_progress
|
||||
from core.config import GEOSERVER_URL, GEOSERVER_USER, GEOSERVER_PASS, QGIS_URL
|
||||
from core.config import GEOSERVER_URL, GEOSERVER_USER, GEOSERVER_PASS, QGIS_URL, MAIN_API_URL, SERVICE_KEY
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
|
@ -207,3 +209,110 @@ def get_style(style_name: str, workspace: str = None):
|
|||
|
||||
except requests.exceptions.RequestException as e:
|
||||
raise HTTPException(status_code=500, detail=f"Request error: {str(e)}")
|
||||
|
||||
|
||||
# =============================================================
|
||||
# cleansing query
|
||||
# =============================================================
|
||||
|
||||
# async def query_cleansing_data_fn(
|
||||
# table_name: str
|
||||
# ):
|
||||
# try:
|
||||
# async with engine.begin() as conn:
|
||||
# await conn.execute(
|
||||
# text("SELECT public.fn_cleansing_satupeta_polygon(:table_name)"),
|
||||
# {"table_name": table_name}
|
||||
# )
|
||||
# return "done"
|
||||
|
||||
# except SQLAlchemyError as e:
|
||||
# raise RuntimeError(f"Fix geometry failed: {str(e)}")
|
||||
|
||||
async def query_cleansing_data(
    table_name: str
):
    """Run the cleansing stored procedure for *table_name*.

    Executes ``pr_cleansing_satupeta_polygon`` inside a single
    transaction and returns the literal string ``"done"`` on success.

    Raises:
        RuntimeError: when the database call fails.
    """
    stmt = text("CALL pr_cleansing_satupeta_polygon(:table_name, NULL);")
    params = {"table_name": table_name}
    try:
        async with engine.begin() as connection:
            await connection.execute(stmt, params)
        return "done"

    except SQLAlchemyError as e:
        raise RuntimeError(f"Fix geometry failed: {str(e)}")
|
||||
|
||||
|
||||
|
||||
async def publish_layer(table_name: str, job_id: str):
    """Publish *table_name* to GeoServer and GeoNetwork, reporting progress.

    On success the job is marked FINISHED and a dict holding the
    OpenLayers preview URL and the GeoNetwork metadata UUID is returned.
    On any failure the job is marked FAILED and a RuntimeError is raised.
    """
    try:
        await report_progress(job_id, "cleansing", 50, "Cleansing data selesai")

        links = publish_layer_to_geoserver(table_name, job_id)
        await report_progress(job_id, "publish_geoserver", 80, "Publish GeoServer selesai")

        metadata_uuid = publish_metadata(
            table_name=table_name,
            geoserver_links=links
        )
        await report_progress(job_id, "done", 100, "Publish GeoNetwork selesai")

        update_job_status(table_name, "FINISHED", job_id)
        return {
            "geos_link": links["layer_url"],
            "uuid": metadata_uuid
        }

    except Exception as e:
        # Any step failing flips the job to FAILED before re-raising.
        update_job_status(table_name, "FAILED", job_id)
        raise RuntimeError(f"Publish layer gagal: {e}") from e
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
async def upload_to_main(payload: dict):
    """POST the mapset *payload* to the main API's internal endpoint.

    Authenticates with the shared service key header and returns
    ``{"status": "success", "data": <response json>}`` on a 2xx reply.

    Raises:
        HTTPException: 500 for connection/DNS/timeout or unexpected
            failures; otherwise mirrors the upstream status code.
    """
    try:
        async with httpx.AsyncClient(timeout=10) as client:
            response = await client.post(
                MAIN_API_URL + "/api/internal/mapsets",
                json=payload,
                headers={
                    "X-SERVICE-KEY": SERVICE_KEY
                }
            )
            response.raise_for_status()

        # Debug print removed; rely on the structured return value instead.
        return {
            "status": "success",
            "data": response.json()
        }

    except httpx.RequestError as e:
        # Connection errors, DNS failures, timeouts, etc.
        raise HTTPException(
            status_code=500,
            detail=f"Gagal connect ke MAIN_API: {str(e)}"
        )

    except httpx.HTTPStatusError as e:
        # Upstream replied but with an error status (4xx / 5xx).
        raise HTTPException(
            status_code=e.response.status_code,
            detail=f"MAIN_API error: {e.response.text}"
        )

    except Exception as e:
        # Fallback for any other error.
        raise HTTPException(
            status_code=500,
            detail=f"Unexpected error: {str(e)}"
        )
|
||||
|
|
@ -6,6 +6,9 @@ load_dotenv()
|
|||
|
||||
API_VERSION = "2.1.3"
|
||||
|
||||
MAIN_API_URL = os.getenv("MAIN_API_URL")
|
||||
SERVICE_KEY = os.getenv("SERVICE_KEY")
|
||||
|
||||
POSTGIS_URL = os.getenv("POSTGIS_URL")
|
||||
POSTGIS_SYNC_URL = os.getenv("SYNC_URL")
|
||||
|
||||
|
|
@ -26,6 +29,7 @@ UPLOAD_FOLDER = Path(os.getenv("UPLOAD_FOLDER", "./uploads"))
|
|||
MAX_FILE_MB = int(os.getenv("MAX_FILE_MB", 30))
|
||||
|
||||
ALLOWED_ORIGINS = [
|
||||
"http://localhost:4000",
|
||||
"http://localhost:3000",
|
||||
"http://127.0.0.1:3000",
|
||||
"http://localhost:5173",
|
||||
|
|
|
|||
|
|
@ -24,9 +24,9 @@ async def loginService(username: str, password: str, db: AsyncSession):
|
|||
if not pwd_context.verify(password, user.password_hash):
|
||||
raise errorRes(status_code=401, message="Invalid username or password")
|
||||
|
||||
# Validation for institution user
|
||||
if user.role != "admin" and not user.institution_id:
|
||||
raise errorRes(status_code=403, message="User must belong to an institution")
|
||||
# Validation for organization user
|
||||
if user.role != "admin" and not user.organization_id:
|
||||
raise errorRes(status_code=403, message="User must belong to an organization")
|
||||
|
||||
# Generate single active token
|
||||
token = str(uuid4())
|
||||
|
|
@ -41,7 +41,7 @@ async def loginService(username: str, password: str, db: AsyncSession):
|
|||
"status": "success",
|
||||
"username": user.username,
|
||||
"role": user.role,
|
||||
"institution_id": user.institution_id,
|
||||
"organization_id": user.organization_id,
|
||||
"token": token,
|
||||
"token_expired_at": expiry.isoformat()
|
||||
}
|
||||
|
|
|
|||
|
|
@ -4,7 +4,6 @@ from database.connection import sync_engine
|
|||
from utils.logger_config import log_activity
|
||||
|
||||
def update_job_status(table_name: str, status: str, job_id: str = None):
|
||||
print("update status")
|
||||
query = text("""
|
||||
UPDATE backend.author_metadata
|
||||
SET process = :status,
|
||||
|
|
|
|||
|
|
@ -7,6 +7,22 @@ from datetime import datetime
|
|||
from uuid import uuid4
|
||||
import re
|
||||
|
||||
|
||||
|
||||
def create_gn_session():
    """Open an authenticated GeoNetwork session and fetch its XSRF token.

    Returns:
        tuple: ``(session, xsrf_token)`` ready for further API calls.

    Raises:
        Exception: when GeoNetwork does not hand back an XSRF token.
    """
    gn_session = requests.Session()
    gn_session.auth = (GEONETWORK_USER, GEONETWORK_PASS)

    # Priming request: GeoNetwork sets the XSRF-TOKEN cookie on first contact.
    gn_session.get(f"{GEONETWORK_URL}/srv/eng/info?type=me")

    token = gn_session.cookies.get("XSRF-TOKEN")
    if not token:
        raise Exception("XSRF token missing")

    return gn_session, token
|
||||
|
||||
|
||||
|
||||
def escape_url_params(url: str) -> str:
|
||||
"""
|
||||
Escape karakter berbahaya di dalam URL agar valid dalam XML.
|
||||
|
|
@ -73,30 +89,6 @@ def get_extent(table_name: str):
|
|||
"ymax": -5.4819 # north
|
||||
}
|
||||
|
||||
|
||||
# def get_author_metadata(table_name: str):
|
||||
|
||||
# sql = """
|
||||
# SELECT table_title, dataset_title, dataset_abstract, keywords, date_created,
|
||||
# organization_name, contact_person_name,
|
||||
# contact_email, contact_phone, geom_type
|
||||
# FROM backend.author_metadata
|
||||
# WHERE table_title = :table
|
||||
# LIMIT 1
|
||||
# """
|
||||
|
||||
# conn = engine.connect()
|
||||
# try:
|
||||
# row = conn.execute(text(sql), {"table": table_name}).fetchone()
|
||||
# finally:
|
||||
# conn.close()
|
||||
|
||||
# if not row:
|
||||
# raise Exception(f"Tidak ada metadata untuk tabel: {table_name}")
|
||||
|
||||
# # FIX: SQLAlchemy Row → dict
|
||||
# return dict(row._mapping)
|
||||
|
||||
def get_author_metadata(table_name: str):
|
||||
|
||||
sql = """
|
||||
|
|
@ -356,10 +348,10 @@ def generate_metadata_xml(table_name, meta, extent, geoserver_links):
|
|||
<gmd:pointOfContact>
|
||||
<gmd:CI_ResponsibleParty>
|
||||
<gmd:individualName>
|
||||
<gco:CharacterString>Dinas Tenaga Kerja dan Transmigrasi Provinsi Jawa Timur</gco:CharacterString>
|
||||
<gco:CharacterString>Lab AI Polinema</gco:CharacterString>
|
||||
</gmd:individualName>
|
||||
<gmd:organisationName>
|
||||
<gco:CharacterString>Dinas Tenaga Kerja dan Transmigrasi Provinsi Jawa Timur</gco:CharacterString>
|
||||
<gco:CharacterString>Lab AI Polinema</gco:CharacterString>
|
||||
</gmd:organisationName>
|
||||
<gmd:positionName gco:nilReason="missing"/>
|
||||
<gmd:contactInfo>
|
||||
|
|
@ -510,9 +502,7 @@ def generate_metadata_xml(table_name, meta, extent, geoserver_links):
|
|||
<gmd:onLine>
|
||||
<gmd:CI_OnlineResource>
|
||||
<gmd:linkage>
|
||||
<gmd:URL>
|
||||
https://geoserver.jatimprov.go.id/wms?service=WMS&version=1.1.0&request=GetMap&layers=satupeta:fngsl_trw_1_4_2020_2023&bbox=110.89527893066406,-8.78022289276123,116.27019500732422,-5.042964935302734&width=768&height=534&srs=EPSG:4326&styles=&format=application/openlayers
|
||||
</gmd:URL>
|
||||
<gmd:URL>{geoserver_links["wms_url"]}</gmd:URL>
|
||||
</gmd:linkage>
|
||||
<gmd:protocol>
|
||||
<gco:CharacterString>WWW:LINK-1.0-http--link</gco:CharacterString>
|
||||
|
|
@ -578,21 +568,23 @@ def generate_metadata_xml(table_name, meta, extent, geoserver_links):
|
|||
"""
|
||||
|
||||
|
||||
# Geonetwork version 4.4.9.0
|
||||
def upload_metadata_to_geonetwork(xml_metadata: str):
|
||||
session = requests.Session()
|
||||
session.auth = (GEONETWORK_USER, GEONETWORK_PASS)
|
||||
# session = requests.Session()
|
||||
# session.auth = (GEONETWORK_USER, GEONETWORK_PASS)
|
||||
|
||||
# 1. Get XSRF token
|
||||
try:
|
||||
info_url = f"{GEONETWORK_URL}/srv/eng/info?type=me"
|
||||
session.get(info_url)
|
||||
except requests.exceptions.RequestException as e:
|
||||
raise HTTPException(status_code=503, detail=f"Failed to connect to GeoNetwork: {e}")
|
||||
# # 1. Get XSRF token
|
||||
# try:
|
||||
# info_url = f"{GEONETWORK_URL}/srv/eng/info?type=me"
|
||||
# session.get(info_url)
|
||||
# except requests.exceptions.RequestException as e:
|
||||
# raise HTTPException(status_code=503, detail=f"Failed to connect to GeoNetwork: {e}")
|
||||
|
||||
xsrf_token = session.cookies.get('XSRF-TOKEN')
|
||||
if not xsrf_token:
|
||||
raise HTTPException(status_code=500, detail="Could not retrieve XSRF-TOKEN from GeoNetwork.")
|
||||
# xsrf_token = session.cookies.get('XSRF-TOKEN')
|
||||
# if not xsrf_token:
|
||||
# raise HTTPException(status_code=500, detail="Could not retrieve XSRF-TOKEN from GeoNetwork.")
|
||||
|
||||
session, xsrf_token = create_gn_session()
|
||||
headers = {
|
||||
'X-XSRF-TOKEN': xsrf_token,
|
||||
'Accept': 'application/json'
|
||||
|
|
@ -605,15 +597,32 @@ def upload_metadata_to_geonetwork(xml_metadata: str):
|
|||
'file': ('metadata.xml', xml_metadata, 'application/xml')
|
||||
}
|
||||
|
||||
params = {
|
||||
"ownerGroup": 1, # all
|
||||
"ownerUser": 1 # admin
|
||||
}
|
||||
|
||||
response = session.post(
|
||||
GN_API_RECORDS_URL,
|
||||
params=params,
|
||||
files=files,
|
||||
headers=headers,
|
||||
cookies=session.cookies.get_dict()
|
||||
)
|
||||
|
||||
# print("response", response.json())
|
||||
return response.json()
|
||||
metadata_infos = response.json().get("metadataInfos", {})
|
||||
uuid = None
|
||||
for records in metadata_infos.values():
|
||||
if records and isinstance(records, list):
|
||||
uuid = records[0].get("uuid")
|
||||
break
|
||||
if not uuid:
|
||||
raise ValueError("UUID not found in GeoNetwork response")
|
||||
|
||||
publish_record(session, uuid)
|
||||
|
||||
# print("response", response.json())
|
||||
return uuid
|
||||
|
||||
|
||||
|
||||
|
|
@ -629,11 +638,59 @@ def publish_metadata(table_name: str, geoserver_links: dict):
|
|||
)
|
||||
|
||||
xml_clean = fix_xml_urls(xml)
|
||||
response = upload_metadata_to_geonetwork(xml_clean)
|
||||
uuid = upload_metadata_to_geonetwork(xml_clean)
|
||||
|
||||
uuid = response.get("uuid")
|
||||
print(f"[GeoNetwork] Metadata uploaded. UUID = {uuid}")
|
||||
|
||||
return uuid
|
||||
|
||||
|
||||
|
||||
def publish_record(session, uuid):
    """Make the GeoNetwork record *uuid* viewable via its sharing endpoint.

    Clears existing privileges and grants group 1 the ``view`` operation
    (group 1 is used as the "all" group elsewhere in this module).

    Args:
        session: authenticated ``requests.Session`` holding the
            XSRF-TOKEN cookie (see ``create_gn_session``).
        uuid: GeoNetwork record identifier.

    Raises:
        requests.HTTPError: when GeoNetwork rejects the sharing update.
    """
    # Debug print of the uuid removed.
    xsrf_token = session.cookies.get('XSRF-TOKEN')

    headers = {
        "X-XSRF-TOKEN": xsrf_token,
        "Accept": "application/json",
        "Content-Type": "application/json"
    }

    url = f"{GEONETWORK_URL}/srv/api/records/{uuid}/sharing"

    payload = {
        "clear": True,
        "privileges": [
            {
                "group": 1,
                "operations": {
                    "view": True
                }
            }
        ]
    }

    response = session.put(url, json=payload, headers=headers)
    response.raise_for_status()
|
||||
|
||||
|
||||
# single stand func
|
||||
# def publish_record(uuid):
|
||||
# session, xsrf_token = create_gn_session()
|
||||
|
||||
# headers = {
|
||||
# "X-XSRF-TOKEN": xsrf_token,
|
||||
# "Content-Type": "application/json"
|
||||
# }
|
||||
|
||||
# url = f"{GEONETWORK_URL}/srv/api/records/{uuid}/sharing"
|
||||
|
||||
# payload = {
|
||||
# "clear": True,
|
||||
# "privileges": [
|
||||
# {"group": 1, "operations": {"view": True}}
|
||||
# ]
|
||||
# }
|
||||
|
||||
# resp = session.put(url, json=payload, headers=headers)
|
||||
# resp.raise_for_status()
|
||||
|
|
|
|||
|
|
@ -52,18 +52,52 @@ def publish_layer_to_geoserver(table: str, job_id: str):
|
|||
print(f"[WARNING] SLD file tidak ditemukan: {sld_file}")
|
||||
else:
|
||||
print(f"[GeoServer] Upload SLD {sld_file}")
|
||||
style_url = f"{GEOSERVER_URL}/rest/styles"
|
||||
|
||||
with open(sld_file, "rb") as sld:
|
||||
requests.post(
|
||||
f"{style_url}?name={style_name}&workspace={GEOSERVER_WORKSPACE}",
|
||||
auth=(GEOSERVER_USER, GEOSERVER_PASS),
|
||||
headers={"Content-Type": "application/vnd.ogc.sld+xml"},
|
||||
data=sld.read()
|
||||
#old
|
||||
# style_url = f"{GEOSERVER_URL}/rest/styles"
|
||||
|
||||
# with open(sld_file, "rb") as sld:
|
||||
# requests.post(
|
||||
# f"{style_url}?name={style_name}&workspace={GEOSERVER_WORKSPACE}",
|
||||
# auth=(GEOSERVER_USER, GEOSERVER_PASS),
|
||||
# headers={"Content-Type": "application/vnd.ogc.sld+xml"},
|
||||
# data=sld.read()
|
||||
# )
|
||||
|
||||
# print(f"[GeoServer] SLD uploaded: {style_name}")
|
||||
|
||||
|
||||
|
||||
#new
|
||||
style_url = (
|
||||
f"{GEOSERVER_URL}/rest/workspaces/"
|
||||
f"{GEOSERVER_WORKSPACE}/styles"
|
||||
)
|
||||
|
||||
with open(sld_file, "r", encoding="utf-8") as f:
|
||||
sld_content = f.read()
|
||||
|
||||
# 🔥 INI BARIS PENTINGNYA
|
||||
sld_content = sld_content.lstrip("\ufeff \t\r\n")
|
||||
|
||||
resp = requests.post(
|
||||
f"{style_url}?name={style_name}",
|
||||
auth=(GEOSERVER_USER, GEOSERVER_PASS),
|
||||
headers={"Content-Type": "application/vnd.ogc.sld+xml"},
|
||||
data=sld_content.encode("utf-8")
|
||||
)
|
||||
|
||||
|
||||
if resp.status_code not in (200, 201):
|
||||
raise Exception(
|
||||
f"Upload SLD gagal ({resp.status_code}): {resp.text}"
|
||||
)
|
||||
|
||||
print(f"[GeoServer] SLD uploaded: {style_name}")
|
||||
|
||||
|
||||
|
||||
|
||||
# ==========================================
|
||||
# 3. Apply SLD to the layer
|
||||
# ==========================================
|
||||
|
|
@ -116,14 +150,29 @@ def publish_layer_to_geoserver(table: str, job_id: str):
|
|||
f"{GEOSERVER_URL}/{GEOSERVER_WORKSPACE}/wfs?"
|
||||
f"service=WFS&request=GetFeature&typeName={GEOSERVER_WORKSPACE}:{table}"
|
||||
)
|
||||
print(f"[GeoServer] WMS URL: {wms_link}")
|
||||
print(f"[GeoServer] WFS URL: {wfs_link}")
|
||||
print(f"[GeoServer] Reload completed. Layer {table} ready.")
|
||||
# print(f"[GeoServer] WMS URL: {wms_link}")
|
||||
# print(f"[GeoServer] WFS URL: {wfs_link}")
|
||||
# print(f"[GeoServer] Reload completed. Layer {table} ready.")
|
||||
openlayer_url = (
|
||||
f"{GEOSERVER_URL}/{GEOSERVER_WORKSPACE}/wms?"
|
||||
f"service=WMS"
|
||||
f"&version=1.1.0"
|
||||
f"&request=GetMap"
|
||||
f"&layers={GEOSERVER_WORKSPACE}:{table}"
|
||||
f"&styles="
|
||||
f"&bbox=110.89528623700005%2C-8.780412043999945%2C116.26994997700001%2C-5.042971664999925"
|
||||
f"&width=768"
|
||||
f"&height=384"
|
||||
f"&srs=EPSG:4326"
|
||||
f"&format=application/openlayers"
|
||||
)
|
||||
|
||||
return {
|
||||
"table": table,
|
||||
"style": style_name,
|
||||
"wms_url": wms_link,
|
||||
"wfs_url": wfs_link
|
||||
"wfs_url": wfs_link,
|
||||
"layer_url": openlayer_url
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
import re
|
||||
import pdfplumber
|
||||
import pandas as pd
|
||||
from services.upload_file.utils.pdf_cleaner import row_ratio, has_mixed_text_and_numbers, is_short_text_row, parse_page_selection, filter_geo_admin_column, cleaning_column
|
||||
from services.upload_file.utils.pdf_cleaner import get_number_column_index, get_start_end_number, normalize_number_column, row_ratio, has_mixed_text_and_numbers, is_short_text_row, parse_page_selection, filter_geo_admin_column, cleaning_column
|
||||
from services.upload_file.upload_exceptions import PDFReadError
|
||||
from utils.logger_config import setup_logger
|
||||
|
||||
|
|
@ -56,6 +56,41 @@ def merge_multiline_header(header_rows):
|
|||
final_header = [v for v in final_header if v not in ['', None]]
|
||||
return final_header
|
||||
|
||||
def merge_parsed_table(tables):
    """Re-attach table fragments that continue a root table's numbering.

    A table whose numbering column starts at 1 — or that has no numbering
    column at all — is treated as a "root"; every other table is a
    fragment. Each fragment is appended to the first root with identical
    columns whose last row number immediately precedes the fragment's
    first row number. Fragments that never match any root are dropped.

    Args:
        tables: list of dicts with ``"columns"`` and ``"rows"`` keys.

    Returns:
        The list of root tables, with matching fragments merged in.
    """
    roots = []
    fragments = []

    # STEP 1: classify each table as root or fragment.
    for table in tables:
        num_idx = get_number_column_index(table["columns"])
        if num_idx is None:
            roots.append(table)
            continue

        start_no, _ = get_start_end_number(table["rows"], num_idx)
        if start_no == 1:
            roots.append(table)
        else:
            fragments.append(table)

    # STEP 2: merge each fragment into a matching root.
    for frag in fragments:
        frag_idx = get_number_column_index(frag["columns"])
        f_start, _ = get_start_end_number(frag["rows"], frag_idx)
        if f_start is None:
            # Fragment numbering is unreadable; nothing to anchor on.
            continue

        for root in roots:
            if root["columns"] != frag["columns"]:
                continue

            root_idx = get_number_column_index(root["columns"])
            _, r_end = get_start_end_number(root["rows"], root_idx)
            if r_end is None:
                # Root numbering unreadable: skip instead of crashing
                # on ``r_end + 1`` (TypeError on None).
                continue

            if f_start == r_end + 1:
                root["rows"].extend(frag["rows"])
                break  # a fragment may attach to only one root

    return roots
|
||||
|
||||
|
||||
def read_pdf(path: str, page: str):
|
||||
"""
|
||||
|
|
@ -90,6 +125,74 @@ def read_pdf(path: str, page: str):
|
|||
Raises:
|
||||
PDFReadError: Jika terjadi kesalahan saat membaca atau parsing PDF.
|
||||
"""
|
||||
# try:
|
||||
# pdf_path = path
|
||||
# selectedPage = page if page else "1"
|
||||
# tables_data = []
|
||||
|
||||
# with pdfplumber.open(pdf_path) as pdf:
|
||||
# total_pages = len(pdf.pages)
|
||||
# selected_pages = parse_page_selection(selectedPage, total_pages)
|
||||
|
||||
# logger.info(f"[INFO] Total Halaman PDF: {total_pages}")
|
||||
# logger.info(f"[INFO] Total Halaman yang dipilih: {len(selected_pages)}")
|
||||
# logger.info(f"[INFO] Halaman yang dipilih untuk dibaca: {selected_pages}")
|
||||
|
||||
# for page_num in selected_pages:
|
||||
# pdf_page = pdf.pages[page_num - 1]
|
||||
# tables = pdf_page.find_tables()
|
||||
# logger.info(f"\n\n[INFO] Halaman {page_num}: {len(tables)} tabel terdeteksi")
|
||||
|
||||
# # pembacaan title ini tidak valid untuk halaman lanscape
|
||||
# # for line in pdf_page.extract_text_lines():
|
||||
# # if line['top'] > tables[0].bbox[1]:
|
||||
# # break
|
||||
# # previous_line = line
|
||||
# # print('[TITLE]', previous_line['text'])
|
||||
|
||||
# for i, t in enumerate(tables, start=1):
|
||||
# table = t.extract()
|
||||
# if len(table) > 2:
|
||||
# print(f"[TBL] tabel : {i} - halaman {page_num}")
|
||||
# tables_data.append(table)
|
||||
|
||||
# logger.info(f"\nTotal tabel terbaca: {len(tables_data)}\n")
|
||||
|
||||
# header_only, body_only = [], []
|
||||
# for tbl in tables_data:
|
||||
# head, body = detect_header_rows(tbl)
|
||||
# header_only.append(head)
|
||||
# body_only.append(body)
|
||||
|
||||
# clean_header = [merge_multiline_header(h) for h in header_only]
|
||||
# clean_body = []
|
||||
|
||||
# for i, raw_body in enumerate(body_only):
|
||||
# con_body = [[cell for cell in row if cell not in (None, '')] for row in raw_body]
|
||||
# cleaned = cleaning_column(clean_header[i], [con_body])
|
||||
# clean_body.append(cleaned[0])
|
||||
|
||||
# parsed = []
|
||||
# for i, (cols, rows) in enumerate(zip(clean_header, clean_body), start=1):
|
||||
# parsed.append({
|
||||
# "title": str(i),
|
||||
# "columns": cols,
|
||||
# "rows": rows
|
||||
# })
|
||||
|
||||
# # =================================================================
|
||||
|
||||
# clean_parsed = filter_geo_admin_column(parsed)
|
||||
# merge_parsed = merge_parsed_table(clean_parsed)
|
||||
|
||||
# logger.info(f"\nTotal tabel valid: {len(merge_parsed)}\n")
|
||||
|
||||
# ordered_tables = [normalize_number_column(t) for t in merge_parsed]
|
||||
# return ordered_tables
|
||||
|
||||
# except Exception as e:
|
||||
# raise PDFReadError(f"Gagal membaca PDF: {e}", code=422)
|
||||
|
||||
try:
|
||||
pdf_path = path
|
||||
selectedPage = page if page else "1"
|
||||
|
|
@ -99,26 +202,36 @@ def read_pdf(path: str, page: str):
|
|||
total_pages = len(pdf.pages)
|
||||
selected_pages = parse_page_selection(selectedPage, total_pages)
|
||||
|
||||
logger.info(f"[INFO] Total halaman PDF: {total_pages}")
|
||||
logger.info(f"[INFO] Total Halaman PDF: {total_pages}")
|
||||
logger.info(f"[INFO] Total Halaman yang dipilih: {len(selected_pages)}")
|
||||
logger.info(f"[INFO] Halaman yang dipilih untuk dibaca: {selected_pages}")
|
||||
|
||||
for page_num in selected_pages:
|
||||
pdf_page = pdf.pages[page_num - 1]
|
||||
tables = pdf_page.find_tables()
|
||||
logger.info(f"[INFO] Halaman {page_num}: {len(tables)} tabel terdeteksi")
|
||||
logger.info(f"\n\n[INFO] Halaman {page_num}: {len(tables)} tabel terdeteksi")
|
||||
|
||||
for t in tables:
|
||||
# pembacaan title ini tidak valid untuk halaman lanscape
|
||||
# for line in pdf_page.extract_text_lines():
|
||||
# if line['top'] > tables[0].bbox[1]:
|
||||
# break
|
||||
# previous_line = line
|
||||
# print('[TITLE]', previous_line['text'])
|
||||
|
||||
for i, t in enumerate(tables, start=1):
|
||||
table = t.extract()
|
||||
if len(table) > 2:
|
||||
tables_data.append(table)
|
||||
print(f"[TBL] tabel : {i} - halaman {page_num}")
|
||||
tables_data.append({"page": f"halaman {page_num} - {i}", "table": table})
|
||||
|
||||
logger.info(f"\nTotal tabel valid: {len(tables_data)}\n")
|
||||
logger.info(f"\nTotal tabel terbaca: {len(tables_data)}\n")
|
||||
|
||||
header_only, body_only = [], []
|
||||
header_only, body_only, page_info = [], [], []
|
||||
for tbl in tables_data:
|
||||
head, body = detect_header_rows(tbl)
|
||||
head, body = detect_header_rows(tbl["table"])
|
||||
header_only.append(head)
|
||||
body_only.append(body)
|
||||
page_info.append(tbl["page"])
|
||||
|
||||
clean_header = [merge_multiline_header(h) for h in header_only]
|
||||
clean_body = []
|
||||
|
|
@ -129,15 +242,22 @@ def read_pdf(path: str, page: str):
|
|||
clean_body.append(cleaned[0])
|
||||
|
||||
parsed = []
|
||||
for i, (cols, rows) in enumerate(zip(clean_header, clean_body), start=1):
|
||||
for i, (cols, rows, page) in enumerate(zip(clean_header, clean_body, page_info), start=1):
|
||||
parsed.append({
|
||||
"title": str(i),
|
||||
"title": page,
|
||||
"columns": cols,
|
||||
"rows": rows
|
||||
})
|
||||
|
||||
# =================================================================
|
||||
|
||||
clean_parsed = filter_geo_admin_column(parsed)
|
||||
return clean_parsed
|
||||
merge_parsed = merge_parsed_table(clean_parsed)
|
||||
|
||||
logger.info(f"\nTotal tabel valid: {len(merge_parsed)}\n")
|
||||
|
||||
ordered_tables = [normalize_number_column(t) for t in merge_parsed]
|
||||
return ordered_tables
|
||||
|
||||
except Exception as e:
|
||||
raise PDFReadError(f"Gagal membaca PDF: {e}", code=422)
|
||||
|
|
|
|||
|
|
@ -10,17 +10,16 @@ import asyncio
|
|||
from pyproj import CRS
|
||||
from shapely.geometry.base import BaseGeometry
|
||||
from shapely.geometry import base as shapely_base
|
||||
from fastapi import File, Form, UploadFile, HTTPException
|
||||
from api.routers.datasets_router import cleansing_data
|
||||
from core.config import UPLOAD_FOLDER, MAX_FILE_MB, VALID_WKT_PREFIXES
|
||||
from fastapi import Depends, File, Form, UploadFile, HTTPException
|
||||
from api.routers.datasets_router import cleansing_data, publish_layer, query_cleansing_data, upload_to_main
|
||||
from core.config import UPLOAD_FOLDER, MAX_FILE_MB, VALID_WKT_PREFIXES, GEONETWORK_URL
|
||||
from services.upload_file.ai_generate import send_metadata
|
||||
from services.upload_file.readers.reader_csv import read_csv
|
||||
from services.upload_file.readers.reader_shp import read_shp
|
||||
from services.upload_file.readers.reader_gdb import read_gdb
|
||||
from services.upload_file.readers.reader_mpk import read_mpk
|
||||
from services.upload_file.readers.reader_pdf import convert_df, read_pdf
|
||||
from services.upload_file.utils.geometry_detector import detect_and_build_geometry
|
||||
from services.upload_file.utils.geometry_detector import attach_polygon_geometry_auto
|
||||
from services.upload_file.utils.geometry_detector import detect_and_build_geometry, attach_polygon_geometry_auto
|
||||
from services.upload_file.upload_ws import report_progress
|
||||
from database.connection import engine, sync_engine
|
||||
from database.models import Base
|
||||
|
|
@ -309,7 +308,7 @@ def process_data(df: pd.DataFrame, ext: str, filename: str, fileDesc: str):
|
|||
}
|
||||
ai_suggest = send_metadata(ai_context)
|
||||
# ai_suggest = {'judul': 'Peta Risiko Letusan Gunung Arjuna di Provinsi Jawa Timur', 'abstrak': 'Peta ini menggambarkan wilayah berisiko letusan Gunung Arjuna yang berada di Provinsi Jawa Timur. Data disajikan dalam bentuk poligon yang menunjukkan zona risiko berdasarkan analisis potensi aktivitas vulkanik.', 'tujuan': 'Data dapat digunakan untuk perencanaan mitigasi bencana dan pengambilan keputusan di wilayah Jawa Timur.', 'keyword': ['Risiko letusan', 'Gunung Arjuna', 'Bencana alam', 'Provinsi Jawa Timur', 'Geologi'], 'kategori': ['Geoscientific information', 'Environment'], 'kategori_mapset': 'Lingkungan Hidup'}
|
||||
print(ai_suggest)
|
||||
# print(ai_suggest)
|
||||
|
||||
response = {
|
||||
"message": "File berhasil dibaca dan dianalisis.",
|
||||
|
|
@ -363,11 +362,11 @@ async def handle_upload_file(file: UploadFile = File(...), page: Optional[str] =
|
|||
tbl = read_pdf(tmp_path, page)
|
||||
if len(tbl) == 0:
|
||||
res = {
|
||||
"message": "Tidak ditemukan tabel valid",
|
||||
"message": "Tidak ditemukan tabel valid pada halaman yang dipilih",
|
||||
"tables": {},
|
||||
"file_type": ext
|
||||
}
|
||||
return successRes(message="Tidak ditemukan tabel valid", data=res)
|
||||
return successRes(message="Tidak ditemukan tabel valid pada halaman yang dipilih", data=res)
|
||||
elif len(tbl) > 1:
|
||||
res = {
|
||||
"message": "File berhasil dibaca dan dianalisis.",
|
||||
|
|
@ -389,10 +388,7 @@ async def handle_upload_file(file: UploadFile = File(...), page: Optional[str] =
|
|||
df = read_gdb(str(tmp_path))
|
||||
|
||||
else:
|
||||
raise errorRes(
|
||||
status_code=400,
|
||||
message="ZIP file tidak mengandung SHP atau GDB yang valid."
|
||||
)
|
||||
return successRes(message="ZIP file tidak mengandung SHP / GDB valid.")
|
||||
else:
|
||||
raise errorRes(status_code=400, message="Unsupported file type")
|
||||
|
||||
|
|
@ -680,17 +676,53 @@ async def handle_to_postgis(payload: UploadRequest, user_id: int = 2):
|
|||
job_id = generate_job_id(str(user_id))
|
||||
result = {
|
||||
"job_id": job_id,
|
||||
"job_status": "wait",
|
||||
"table_name": table_name,
|
||||
"status": "success",
|
||||
"message": f"Tabel '{table_name}' berhasil dibuat.",
|
||||
"total_rows": len(gdf),
|
||||
"geometry_type": unified_geom_type,
|
||||
"crs": detected_crs,
|
||||
"metadata_uuid": ""
|
||||
}
|
||||
save_xml_to_sld(payload.style, job_id)
|
||||
|
||||
await report_progress(job_id, "upload", 20, "Upload selesai")
|
||||
cleansing_data(table_name, job_id)
|
||||
# cleansing_data(table_name, job_id)
|
||||
|
||||
cleansing = await query_cleansing_data(table_name)
|
||||
result['job_status'] = cleansing
|
||||
|
||||
publish = await publish_layer(table_name, job_id)
|
||||
result['metadata_uuid'] = publish['uuid']
|
||||
|
||||
mapset = {
|
||||
"name": payload.title,
|
||||
"description": author.get("abstract"),
|
||||
"scale": "1:25000",
|
||||
"projection_system_id": "0196c746-d1ba-7f1c-9706-5df738679cc7",
|
||||
"category_id": author.get("mapsetCategory"),
|
||||
"data_status": "sementara",
|
||||
"classification_id": "01968b4b-d3f9-76c9-888c-ee887ac31ce4",
|
||||
"producer_id": "019bd4ea-eb33-704e-83c3-8253d457b187",
|
||||
"layer_type": unified_geom_type[0],
|
||||
"source_id": ["019bd4e7-3df8-75c8-9b89-3f310967649c"],
|
||||
"layer_url": publish['geos_link'],
|
||||
"metadata_url": f"{GEONETWORK_URL}/srv/eng/catalog.search#/metadata/{publish['uuid']}",
|
||||
"coverage_level": "provinsi",
|
||||
"coverage_area": "kabupaten",
|
||||
"data_update_period": "Tahunan",
|
||||
"data_version": "2026",
|
||||
"is_popular": False,
|
||||
"is_active": True,
|
||||
"regional_id": "01968b53-a910-7a67-bd10-975b8923b92e",
|
||||
"notes": "Mapset baru dibuat",
|
||||
"status_validation": "on_verification",
|
||||
}
|
||||
|
||||
print("mapset data",mapset)
|
||||
await upload_to_main(mapset)
|
||||
|
||||
return successRes(data=result)
|
||||
|
||||
except Exception as e:
|
||||
|
|
@ -705,6 +737,53 @@ async def handle_to_postgis(payload: UploadRequest, user_id: int = 2):
|
|||
|
||||
|
||||
|
||||
# async def handle_to_postgis(payload: UploadRequest, user_id: int = 2):
|
||||
# try:
|
||||
# job_id = generate_job_id(str(user_id))
|
||||
# result = {
|
||||
# "job_id": job_id,
|
||||
# "job_status": "done",
|
||||
# "table_name": "just for test",
|
||||
# "status": "success",
|
||||
# "message": f"Tabel test berhasil dibuat.",
|
||||
# "total_rows": 10,
|
||||
# "geometry_type": "Polygon",
|
||||
# "crs": "EPSG 4326",
|
||||
# "metadata_uuid": "-"
|
||||
# }
|
||||
|
||||
# mapset = {
|
||||
# "name": "Resiko Letusan Gunung Arjuno",
|
||||
# "description": "Testing Automation Upload",
|
||||
# "scale": "1:25000",
|
||||
# "projection_system_id": "0196c746-d1ba-7f1c-9706-5df738679cc7",
|
||||
# "category_id": "0196c80c-855f-77f9-abd0-0c8a30b8c2f5",
|
||||
# "data_status": "sementara",
|
||||
# "classification_id": "01968b4b-d3f9-76c9-888c-ee887ac31ce4",
|
||||
# "producer_id": "019bd4ea-eb33-704e-83c3-8253d457b187",
|
||||
# "layer_type": "polygon",
|
||||
# "source_id": ["019bd4e7-3df8-75c8-9b89-3f310967649c"],
|
||||
# "layer_url": "http://192.168.60.24:8888/geoserver/wms?service=WMS&version=1.1.0&request=GetMap&layers=labai:risiko_letusan_gunung_arjuno_bromo&bbox=110.89528623700005,-8.780412043999945,116.26994997700001,-5.042971664999925&width=768&height=534&srs=EPSG:4326&styles=&format=application/openlayers",
|
||||
# "metadata_url": "http://192.168.60.24:7777/geonetwork/srv/eng/catalog.search#/metadata/9e5e2f09-13ef-49b5-bb49-1cb12136f63b",
|
||||
# "coverage_level": "provinsi",
|
||||
# "coverage_area": "kabupaten",
|
||||
# "data_update_period": "Tahunan",
|
||||
# "data_version": "2026",
|
||||
# "is_popular": False,
|
||||
# "is_active": True,
|
||||
# "regional_id": "01968b53-a910-7a67-bd10-975b8923b92e",
|
||||
# "notes": "Mapset baru dibuat",
|
||||
# "status_validation": "on_verification",
|
||||
# }
|
||||
|
||||
# await upload_to_main(mapset)
|
||||
|
||||
# return successRes(data=result)
|
||||
|
||||
# except Exception as e:
|
||||
# print("errot", e)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -157,3 +157,52 @@ def is_short_text_row(row):
|
|||
text_only = all(not is_number(c) for c in non_empty)
|
||||
joined = " ".join(non_empty)
|
||||
return text_only and len(non_empty) <= 2 and len(joined) < 20
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def get_number_column_index(columns):
    """Return the index of the first numbering column, or None if absent."""
    return next(
        (idx for idx, header in enumerate(columns) if has_number_header(header)),
        None,
    )
|
||||
|
||||
def get_start_end_number(rows, idx):
    """Return the (first, last) row numbers read from column *idx* of *rows*.

    Returns ``(None, None)`` when the table is empty, the column index is
    missing/None, or either endpoint cell is not parseable as an integer.
    """
    try:
        start_no = int(rows[0][idx])
        end_no = int(rows[-1][idx])
        return start_no, end_no
    # Narrowed from a bare ``except:`` — only the expected failure modes:
    # empty rows (IndexError), idx is None (TypeError), non-numeric cell
    # (ValueError).
    except (ValueError, TypeError, IndexError):
        return None, None
|
||||
|
||||
def normalize_number_column(table):
    """Repair *table*'s numbering column in place so it never regresses.

    Walks the rows tracking the last accepted number: a value that does
    not increase is replaced by the previous number plus one, while
    larger values are kept as-is. Rows whose cell cannot be parsed as an
    integer are left untouched, and tables without a numbering column
    are returned unchanged.

    Args:
        table: dict with ``"columns"`` and ``"rows"`` keys; mutated in place.

    Returns:
        The same *table* object, for chaining.
    """
    columns = table["columns"]
    rows = table["rows"]

    num_idx = get_number_column_index(columns)
    if num_idx is None:
        return table

    current = None

    for row in rows:
        try:
            val = int(row[num_idx])
        # Narrowed from a bare ``except:``: skip only rows whose cell is
        # missing or non-numeric.
        except (ValueError, TypeError, IndexError):
            continue

        if current is None:
            current = val
        elif val <= current:
            # Duplicate or regressed number: continue the sequence.
            current += 1
        else:
            current = val

        row[num_idx] = str(current)

    return table
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user