Fix data cleansing

This commit is contained in:
DmsAnhr 2025-11-29 11:44:08 +07:00
parent 421d0cf90b
commit 14852b6648
6 changed files with 317 additions and 647 deletions

3
.gitignore vendored
View File

@ -1,5 +1,8 @@
.env
test_pg.py test_pg.py
cleansing_service.py cleansing_service.py
postgis_metadata.py
database.py
__pycache__/ __pycache__/
data/ data/

13
core/config.py Normal file
View File

@ -0,0 +1,13 @@
from dotenv import load_dotenv
import os
# Load variables from a local .env file into the process environment
# before reading them below.
load_dotenv()
# Database connection settings, all sourced from the environment.
# NOTE(review): the variable names are lowercase ("host", "port", ...),
# which is unconventional for env vars — confirm the .env file uses the
# same lowercase keys. os.getenv returns None for any missing key.
HOST = os.getenv("host")
PORT = os.getenv("port")
DB = os.getenv("db")
USER = os.getenv("user")
PWD = os.getenv("pwd")
SCHEMA = os.getenv("schema")
GEOM_COL = os.getenv("geom_col")

View File

@ -1,21 +0,0 @@
# SECURITY: credentials are hard-coded here. This module was superseded by
# core/config.py (env-driven settings); if it is ever revived, load these
# values from the environment instead of committing them.
POSTGIS = {
    "host": "192.168.60.24",
    "port": "5432",
    "db": "test_postgis",
    "user": "postgres",
    "password": "12345"
}


def build_uri(table_name: str) -> str:
    """Build a QGIS PostgreSQL provider URI for *table_name*.

    The table is assumed to live in the ``public`` schema and to use
    ``_id`` as its primary-key column. SSL is explicitly disabled.
    """
    parts = [
        f"dbname='{POSTGIS['db']}'",
        f"host='{POSTGIS['host']}'",
        f"port='{POSTGIS['port']}'",
        f"user='{POSTGIS['user']}'",
        f"password='{POSTGIS['password']}'",
        "sslmode=disable",
        f'table="public"."{table_name}"',
        "key='_id'",
    ]
    return " ".join(parts)

View File

@ -1,598 +1,181 @@
from qgis.core import ( from qgis.core import (
QgsDataSourceUri,
QgsFeature, QgsFeature,
QgsVectorLayer, QgsVectorLayer,
QgsVectorLayerExporter, QgsVectorLayerExporter,
QgsVectorFileWriter QgsVectorFileWriter,
QgsWkbTypes
) )
import processing import processing
from typing import Dict from typing import Dict
from database import build_uri from core.config import HOST,PORT,DB,USER,PWD,SCHEMA,GEOM_COL
def load_layer(table_name: str): def load_layer(table_name: str):
uri = build_uri(table_name) uri = QgsDataSourceUri()
print('uri', uri) uri.setConnection(HOST, PORT, DB, USER, PWD)
layer = QgsVectorLayer(uri, table_name, "postgres") uri.setDataSource(SCHEMA, table_name, GEOM_COL, "", "_id")
layer = QgsVectorLayer(uri.uri(), table_name, "postgres")
print("Layer valid:", layer.isValid()) print("Layer valid:", layer.isValid())
return layer return layer
# def cleansing_layer(layer: QgsVectorLayer) -> Dict:
# summary = {
# "total_features_before": layer.featureCount(),
# "invalid_geometries_before": 0,
# "invalid_geometries_fixed": 0,
# "duplicates_removed": 0,
# "sliver_removed": 0,
# "holes_removed": 0
# }
# # ========================================================
# # 1. IDENTIFY INVALID GEOMETRY
# # ========================================================
# invalid_ids = []
# for f in layer.getFeatures():
# if not f.geometry().isGeosValid():
# invalid_ids.append(f.id())
# summary["invalid_geometries_before"] = len(invalid_ids)
# # ========================================================
# # 2. FIX GEOMETRIES
# # ========================================================
# fixed = processing.run(
# "native:fixgeometries",
# {
# "INPUT": layer,
# "OUTPUT": "memory:"
# }
# )["OUTPUT"]
# summary["invalid_geometries_fixed"] = len(invalid_ids)
# # ========================================================
# # 3. ENSURE MULTIPOLYGON
# # ========================================================
# multipolygon = processing.run(
# "native:collect",
# {
# "INPUT": fixed,
# "OUTPUT": "memory:"
# }
# )["OUTPUT"]
# # ========================================================
# # 4. REMOVE DUPLICATE ROWS
# # ========================================================
# all_fields = [f.name() for f in multipolygon.fields()]
# print("Detecting key fields:", all_fields)
# key_fields = None
# # (1) Prefer 'id'
# if "id" in all_fields:
# key_fields = ["id"]
# # (2) Else pick first integer field
# if key_fields is None:
# int_cols = [
# f.name() for f in multipolygon.fields()
# if f.typeName().lower() in ["int", "integer", "bigint"]
# ]
# if int_cols:
# key_fields = [int_cols[0]]
# # (3) Else use all fields
# if key_fields is None:
# key_fields = all_fields
# print("Using key field:", key_fields)
# dedup = processing.run(
# "native:removeduplicatesbyattribute",
# {
# "INPUT": multipolygon,
# "FIELDS": key_fields,
# "METHOD": 0,
# "OUTPUT": "memory:"
# }
# )["OUTPUT"]
# summary["duplicates_removed"] = (
# multipolygon.featureCount() - dedup.featureCount()
# )
# # ========================================================
# # 5. REMOVE DUPLICATE VERTICES
# # ========================================================
# no_dup_vertices = processing.run(
# "native:removeduplicatevertices",
# {
# "INPUT": dedup,
# "VERTICES": 0, # remove exact duplicates
# "OUTPUT": "memory:"
# }
# )["OUTPUT"]
# # ========================================================
# # 6. FIX SRID (REPROJECT IF NEEDED)
# # ========================================================
# # Force SRID to 4326
# reprojected = processing.run(
# "native:reprojectlayer",
# {
# "INPUT": no_dup_vertices,
# "TARGET_CRS": "EPSG:4326",
# "OUTPUT": "memory:"
# }
# )["OUTPUT"]
# # ========================================================
# # 7. REMOVE SLIVER POLYGONS (< 1 m²)
# # ========================================================
# # Filter polygons with area < 1 (threshold bisa kamu ubah)
# slivers = processing.run(
# "native:extractbyexpression",
# {
# "INPUT": reprojected,
# "EXPRESSION": "$area < 1",
# "OUTPUT": "memory:"
# }
# )["OUTPUT"]
# summary["sliver_removed"] = slivers.featureCount()
# # Keep only polygons with area >= 1
# no_sliver = processing.run(
# "native:extractbyexpression",
# {
# "INPUT": reprojected,
# "EXPRESSION": "$area >= 1",
# "OUTPUT": "memory:"
# }
# )["OUTPUT"]
# # ========================================================
# # 8. REMOVE TINY HOLES (< 1 m²)
# # ========================================================
# no_holes = processing.run(
# "native:deleteholes",
# {
# "INPUT": no_sliver,
# "MIN_AREA": 1, # minimum area of hole to keep
# "OUTPUT": "memory:"
# }
# )["OUTPUT"]
# summary["holes_removed"] = 0 # can't count holes easily in PyQGIS
# # ========================================================
# # 9. TRIM STRING FIELDS (ATTRIBUTE CLEANSING)
# # ========================================================
# trimmed = processing.run(
# "qgis:refactorfields",
# {
# "INPUT": no_holes,
# "FIELDS_MAPPING": [
# {
# "expression": f"trim(\"{field.name()}\")"
# if field.typeName().lower() in ["text", "varchar"]
# else f"\"{field.name()}\"",
# "name": field.name(),
# "type": field.type(),
# "length": field.length(),
# "precision": field.precision()
# }
# for field in no_holes.fields()
# ],
# "OUTPUT": "memory:"
# }
# )["OUTPUT"]
# # ========================================================
# # RETURN CLEANED LAYER
# # ========================================================
# return {
# "summary": summary,
# "clean_layer": trimmed
# }
# def cleansing_layer(layer: QgsVectorLayer) -> Dict:
# # ========================================================
# # INITIAL STATE
# # ========================================================
# print("\n========== START CLEANSING ==========")
# print("Step 0: Load Layer")
# print(" - Valid:", layer.isValid())
# print(" - Feature Count:", layer.featureCount())
# summary = {
# "step0_features": layer.featureCount(),
# "step1_invalid_before": 0,
# "step2_after_fix": 0,
# "step3_after_multipolygon": 0,
# "step4_duplicates_removed": 0,
# "step5_after_remove_vertices": 0,
# "step6_after_srid": 0,
# "step7_sliver_removed": 0,
# "step8_after_deleteholes": 0
# }
# # ========================================================
# # 1. VALIDATE GEOMETRY
# # ========================================================
# print("\nStep 1: Identify invalid geometries")
# invalid_ids = []
# for f in layer.getFeatures():
# if not f.geometry().isGeosValid():
# invalid_ids.append(f.id())
# summary["step1_invalid_before"] = len(invalid_ids)
# print(" - Invalid geometries found:", len(invalid_ids))
# # ========================================================
# # 2. FIX GEOMETRIES
# # ========================================================
# print("\nStep 2: Fix geometries")
# fixed = processing.run(
# "native:fixgeometries",
# {"INPUT": layer, "OUTPUT": "memory:"}
# )["OUTPUT"]
# print(" - Valid:", fixed.isValid())
# print(" - Features after fix:", fixed.featureCount())
# summary["step2_after_fix"] = fixed.featureCount()
# # ========================================================
# # 3. ENSURE MULTIPOLYGON
# # ========================================================
# print("\nStep 3: Ensure MULTIPOLYGON")
# multipolygon = processing.run(
# "native:collect",
# {"INPUT": fixed, "OUTPUT": "memory:"}
# )["OUTPUT"]
# print(" - Valid:", multipolygon.isValid())
# print(" - Features:", multipolygon.featureCount())
# summary["step3_after_multipolygon"] = multipolygon.featureCount()
# # ========================================================
# # 4. REMOVE DUPLICATE ROWS
# # ========================================================
# print("\nStep 4: Remove duplicate rows")
# all_fields = [f.name() for f in multipolygon.fields()]
# print(" - All fields:", all_fields)
# key_fields = None
# if "id" in all_fields:
# key_fields = ["id"]
# else:
# int_cols = [
# f.name() for f in multipolygon.fields()
# if f.typeName().lower() in ["int", "integer", "bigint"]
# ]
# if int_cols:
# key_fields = [int_cols[0]]
# else:
# key_fields = all_fields
# print(" - Using duplicate key:", key_fields)
# dedup = processing.run(
# "native:removeduplicatesbyattribute",
# {"INPUT": multipolygon, "FIELDS": key_fields, "METHOD": 0, "OUTPUT": "memory:"}
# )["OUTPUT"]
# duplicates_removed = multipolygon.featureCount() - dedup.featureCount()
# summary["step4_duplicates_removed"] = duplicates_removed
# print(" - Features before:", multipolygon.featureCount())
# print(" - Features after:", dedup.featureCount())
# print(" - Duplicates removed:", duplicates_removed)
# # ========================================================
# # 5. REMOVE DUPLICATE VERTICES
# # ========================================================
# print("\nStep 5: Remove duplicate vertices")
# no_dup_vertices = processing.run(
# "native:removeduplicatevertices",
# {"INPUT": dedup, "VERTICES": 0, "OUTPUT": "memory:"}
# )["OUTPUT"]
# print(" - Features:", no_dup_vertices.featureCount())
# summary["step5_after_remove_vertices"] = no_dup_vertices.featureCount()
# # ========================================================
# # 6. FIX SRID / REPROJECT
# # ========================================================
# print("\nStep 6: Reproject (Fix SRID to EPSG:4326)")
# reprojected = processing.run(
# "native:reprojectlayer",
# {"INPUT": no_dup_vertices, "TARGET_CRS": "EPSG:4326", "OUTPUT": "memory:"}
# )["OUTPUT"]
# print(" - Features:", reprojected.featureCount())
# summary["step6_after_srid"] = reprojected.featureCount()
# # ========================================================
# # 7. REMOVE SLIVER POLYGONS (< 1 m2)
# # ========================================================
# print("\nStep 7: Remove sliver polygons (<1 m²)")
# slivers = processing.run(
# "native:extractbyexpression",
# {"INPUT": reprojected, "EXPRESSION": "$area < 1", "OUTPUT": "memory:"}
# )["OUTPUT"]
# summary["step7_sliver_removed"] = slivers.featureCount()
# print(" - Slivers found:", slivers.featureCount())
# no_sliver = processing.run(
# "native:extractbyexpression",
# {"INPUT": reprojected, "EXPRESSION": "$area >= 1", "OUTPUT": "memory:"}
# )["OUTPUT"]
# print(" - Features left after removing slivers:", no_sliver.featureCount())
# # ========================================================
# # 8. REMOVE TINY HOLES (< 1 m2)
# # ========================================================
# print("\nStep 8: Remove tiny holes")
# no_holes = processing.run(
# "native:deleteholes",
# {"INPUT": no_sliver, "MIN_AREA": 1, "OUTPUT": "memory:"}
# )["OUTPUT"]
# print(" - Features:", no_holes.featureCount())
# summary["step8_after_deleteholes"] = no_holes.featureCount()
# # ========================================================
# # FINISH (TRIM ATTRIBUTES)
# # ========================================================
# print("\nFinal Step: Trim string fields")
# trimmed = processing.run(
# "qgis:refactorfields",
# {
# "INPUT": no_holes,
# "FIELDS_MAPPING": [
# {
# "expression": f"trim(\"{field.name()}\")"
# if field.typeName().lower() in ["text", "varchar"]
# else f"\"{field.name()}\"",
# "name": field.name(),
# "type": field.type(),
# "length": field.length(),
# "precision": field.precision()
# }
# for field in no_holes.fields()
# ],
# "OUTPUT": "memory:"
# }
# )["OUTPUT"]
# print(" - Final feature count:", trimmed.featureCount())
# print("========== CLEANSING DONE ==========\n")
# return {
# "summary": summary,
# "clean_layer": trimmed
# }
# self-intersection # self-intersection
def cleansing_layer(layer: QgsVectorLayer) -> Dict: def cleansing_layer(layer: QgsVectorLayer) -> Dict:
# ========================================================
# INITIAL STATE
# ========================================================
print("\n========== START CLEANSING ==========") print("\n========== START CLEANSING ==========")
print("Step 0: Load Layer") print("Step 0: Load Layer")
print(" - Valid:", layer.isValid()) print(" - Valid:", layer.isValid())
print(" - Feature Count:", layer.featureCount()) print(" - Feature Count:", layer.featureCount())
print(" - type:", layer.geometryType())
summary = { summary = {
"step0_features": layer.featureCount(), "features": layer.featureCount(),
"step1_invalid_before": 0, "invalid_before": 0,
"step1_5_self_intersections": 0, "after_fixgeometries": 0,
"step2_after_fix": 0, "after_fix": 0,
"step3_after_multipolygon": 0, "after_multipolygon": 0,
"step4_duplicates_removed": 0, "duplicates_removed": 0,
"step5_after_remove_vertices": 0, "after_remove_vertices": 0,
"step6_after_srid": 0, "after_srid": 0,
"step7_sliver_removed": 0, "sliver_removed": 0,
"step8_after_deleteholes": 0 "after_deleteholes": 0,
"valid_after": 0
} }
# ======================================================== # 1. Geometry validity check
# 1. VALIDATE GEOMETRY print("\nStep 1: Geometry validity check (QGIS native)")
# ======================================================== validity = processing.run(
print("\nStep 1: Identify invalid geometries") "qgis:checkvalidity",
{
"INPUT_LAYER": layer,
"METHOD": 2, # GEOS
"IGNORE_RING_SELF_INTERSECTION": False,
"VALID_OUTPUT": "memory:",
"INVALID_OUTPUT": "memory:",
"ERROR_OUTPUT": "memory:"
}
)
invalid_layer = validity["INVALID_OUTPUT"]
error_table = validity["ERROR_OUTPUT"]
invalid_count = invalid_layer.featureCount()
summary["invalid_before"] = invalid_count
print(" - Invalid geometries found:", invalid_count)
print(" - Total error messages:", error_table.featureCount())
invalid_ids = [] # 1.1 Fix invalid geometries
for f in layer.getFeatures(): # print("\nStep 1.1: Fix invalid geometries (FixGeometries)")
if not f.geometry().isGeosValid(): # fixed_pre = processing.run("native:fixgeometries", {"INPUT": layer, "OUTPUT": "memory:"})["OUTPUT"]
invalid_ids.append(f.id()) # summary["after_fixgeometries"] = fixed_pre.featureCount()
# print(" - Features after FixGeometries:", fixed_pre.featureCount())
# layer = fixed_pre
summary["step1_invalid_before"] = len(invalid_ids) # 2. Fix geometries (again)
print(" - Invalid geometries found:", len(invalid_ids))
# ========================================================
# 1.5 DETECT GEOMETRY ERRORS (MANUAL GEOS VALIDATION)
# ========================================================
print("\nStep 1.5: Detect geometry errors (universal GEOS-safe method)")
errors = []
for f in layer.getFeatures():
geom = f.geometry()
if not geom.isGeosValid():
# Kita hanya tandai invalid (tanpa reason)
errors.append(f.id())
summary["step1_5_geometry_errors"] = len(errors)
print(" - Geometry errors detected:", len(errors))
print(" - Invalid feature IDs (first 10):", errors[:10])
# ========================================================
# 1.6 FIX INVALID GEOMETRIES (Native FixGeometries)
# ========================================================
print("\nStep 1.6: Fix invalid geometries (FixGeometries)")
fixed_pre = processing.run(
"native:fixgeometries",
{"INPUT": layer, "OUTPUT": "memory:"}
)["OUTPUT"]
summary["step1_6_after_fixgeometries"] = fixed_pre.featureCount()
print(" - Features after FixGeometries:", fixed_pre.featureCount())
layer = fixed_pre
# ========================================================
# 2. FIX GEOMETRIES (INCLUDES SELF-INTERSECTION FIX)
# ========================================================
print("\nStep 2: Fix geometries (including self-intersections)") print("\nStep 2: Fix geometries (including self-intersections)")
fixed = processing.run("native:fixgeometries", {"INPUT": layer, "OUTPUT": "memory:"})["OUTPUT"]
fixed = processing.run(
"native:fixgeometries",
{"INPUT": layer, "OUTPUT": "memory:"}
)["OUTPUT"]
print(" - Valid after fix:", fixed.isValid()) print(" - Valid after fix:", fixed.isValid())
print(" - Features after fix:", fixed.featureCount()) print(" - Features after fix:", fixed.featureCount())
summary["step2_after_fix"] = fixed.featureCount() summary["after_fix"] = fixed.featureCount()
# ======================================================== # ========================================================
# 3. ENSURE MULTIPOLYGON # 3. ENSURE MULTIPOLYGON (LTR compatible!!)
# ======================================================== # ========================================================
print("\nStep 3: Ensure MULTIPOLYGON") print("\nStep 3: Ensure MULTIPOLYGON (LTR-safe method)")
multipolygon = processing.run( # Step 3.1: Pecah multiparts → single (agar bersih)
"native:collect", singleparts = processing.run(
"native:multiparttosingleparts",
{"INPUT": fixed, "OUTPUT": "memory:"} {"INPUT": fixed, "OUTPUT": "memory:"}
)["OUTPUT"] )["OUTPUT"]
print(" - After multiparttosingleparts:", singleparts.featureCount())
# Step 3.2: Promote semua polygon → multipolygon
multipolygon = processing.run(
"native:promotetomulti",
{"INPUT": singleparts, "OUTPUT": "memory:"}
)["OUTPUT"]
print(" - After promotetomulti:", multipolygon.featureCount())
print(" - Valid:", multipolygon.isValid()) print(" - Valid:", multipolygon.isValid())
print(" - Features:", multipolygon.featureCount())
summary["step3_after_multipolygon"] = multipolygon.featureCount()
# ======================================================== summary["after_multipolygon"] = multipolygon.featureCount()
# 4. REMOVE DUPLICATE ROWS
# ========================================================
# 4. Remove duplicate rows
print("\nStep 4: Remove duplicate rows") print("\nStep 4: Remove duplicate rows")
all_fields = [f.name() for f in multipolygon.fields()] all_fields = [f.name() for f in multipolygon.fields()]
print(" - All fields:", all_fields) print(" - All fields:", all_fields)
if "id" in all_fields: if "id" in all_fields:
key_fields = ["id"] key_fields = ["id"]
else: else:
int_cols = [ int_cols = [f.name() for f in multipolygon.fields() if f.typeName().lower() in ["int", "integer", "bigint"]]
f.name() for f in multipolygon.fields()
if f.typeName().lower() in ["int", "integer", "bigint"]
]
key_fields = [int_cols[0]] if int_cols else all_fields key_fields = [int_cols[0]] if int_cols else all_fields
print(" - Using duplicate key:", key_fields) print(" - Using duplicate key:", key_fields)
dedup = processing.run("native:removeduplicatesbyattribute", {"INPUT": multipolygon, "FIELDS": key_fields, "METHOD": 0, "OUTPUT": "memory:"})["OUTPUT"]
dedup = processing.run(
"native:removeduplicatesbyattribute",
{"INPUT": multipolygon, "FIELDS": key_fields, "METHOD": 0, "OUTPUT": "memory:"}
)["OUTPUT"]
duplicates_removed = multipolygon.featureCount() - dedup.featureCount() duplicates_removed = multipolygon.featureCount() - dedup.featureCount()
summary["step4_duplicates_removed"] = duplicates_removed summary["duplicates_removed"] = duplicates_removed
print(" - Features before:", multipolygon.featureCount()) print(" - Features before:", multipolygon.featureCount())
print(" - Features after:", dedup.featureCount()) print(" - Features after:", dedup.featureCount())
print(" - Duplicates removed:", duplicates_removed) print(" - Duplicates removed:", duplicates_removed)
# ======================================================== # 5. Remove duplicate vertices
# 5. REMOVE DUPLICATE VERTICES
# ========================================================
print("\nStep 5: Remove duplicate vertices") print("\nStep 5: Remove duplicate vertices")
no_dup_vertices = processing.run("native:removeduplicatevertices", {"INPUT": dedup, "VERTICES": 0, "OUTPUT": "memory:"})["OUTPUT"]
no_dup_vertices = processing.run(
"native:removeduplicatevertices",
{"INPUT": dedup, "VERTICES": 0, "OUTPUT": "memory:"}
)["OUTPUT"]
print(" - Features:", no_dup_vertices.featureCount()) print(" - Features:", no_dup_vertices.featureCount())
summary["step5_after_remove_vertices"] = no_dup_vertices.featureCount() summary["after_remove_vertices"] = no_dup_vertices.featureCount()
# ======================================================== print("\nStep 5.5: Check input CRS before reprojection")
# 6. FIX SRID / REPROJECT input_crs = no_dup_vertices.crs()
# ======================================================== if input_crs.isValid():
print("\nStep 6: Reproject (Fix SRID to EPSG:4326)") print(" - Input CRS:", input_crs.authid())
print(" - CRS description:", input_crs.description())
else:
print(" - CRS INVALID or UNDEFINED")
reprojected = processing.run( # 6. REPROJECT to metric CRS BEFORE any area-based ops (use EPSG:4326 or local UTM)
"native:reprojectlayer", print("\nStep 6: Reproject layer to EPSG:4326 for metric area calculations")
{"INPUT": no_dup_vertices, "TARGET_CRS": "EPSG:4326", "OUTPUT": "memory:"} # choose EPSG:4326 or better choose local UTM if you know it; EPSG:4326 is general metric
)["OUTPUT"] final_proj = processing.run("native:reprojectlayer", {"INPUT": no_dup_vertices, "TARGET_CRS": "EPSG:4326", "OUTPUT": "memory:"})["OUTPUT"]
print(" - Features after reproject:", final_proj.featureCount())
summary["after_srid"] = final_proj.featureCount()
print(" - Features:", reprojected.featureCount())
summary["step6_after_srid"] = reprojected.featureCount()
# ======================================================== # 7. Remove sliver polygons based on metric area (< 1 m^2)
# 7. REMOVE SLIVER POLYGONS (< 1 m2) # print("\nStep 7: Remove sliver polygons (<1 m²)")
# ======================================================== # # use $area now because layer is in meters (EPSG:3857)
print("\nStep 7: Remove sliver polygons (<1 m²)") # slivers = processing.run("native:extractbyexpression", {"INPUT": reprojected, "EXPRESSION": "$area < 1", "OUTPUT": "memory:"})["OUTPUT"]
# summary["sliver_removed"] = slivers.featureCount()
# print(" - Slivers found:", slivers.featureCount())
# no_sliver = processing.run(
# "native:extractbyexpression",
# {
# "INPUT": reprojected,
# "EXPRESSION": "geometry IS NOT NULL AND $area >= 1",
# "OUTPUT": "memory:"
# }
# )["OUTPUT"]
# print(" - Features left after removing slivers:", no_sliver.featureCount())
slivers = processing.run( # # 8. Remove tiny holes (<1 m^2) — still in metric CRS
"native:extractbyexpression", # print("\nStep 8: Remove tiny holes (<1 m²)")
{"INPUT": reprojected, "EXPRESSION": "$area < 1", "OUTPUT": "memory:"} # no_holes = processing.run("native:deleteholes", {"INPUT": no_sliver, "MIN_AREA": 1, "OUTPUT": "memory:"})["OUTPUT"]
)["OUTPUT"] # print(" - Features after delete holes:", no_holes.featureCount())
# summary["after_deleteholes"] = no_holes.featureCount()
summary["step7_sliver_removed"] = slivers.featureCount() # # Reproject BACK to EPSG:4326 for downstream (GeoServer/PostGIS target)
print(" - Slivers found:", slivers.featureCount()) # print("\nStep 9: Reproject back to EPSG:4326")
# final_proj = processing.run("native:reprojectlayer", {"INPUT": no_holes, "TARGET_CRS": "EPSG:4326", "OUTPUT": "memory:"})["OUTPUT"]
# print(" - Features:", final_proj.featureCount())
no_sliver = processing.run( # Final: Trim string fields
"native:extractbyexpression",
{"INPUT": reprojected, "EXPRESSION": "$area >= 1", "OUTPUT": "memory:"}
)["OUTPUT"]
print(" - Features left after removing slivers:", no_sliver.featureCount())
# ========================================================
# 8. REMOVE TINY HOLES (< 1 m2)
# ========================================================
print("\nStep 8: Remove tiny holes")
no_holes = processing.run(
"native:deleteholes",
{"INPUT": no_sliver, "MIN_AREA": 1, "OUTPUT": "memory:"}
)["OUTPUT"]
print(" - Features:", no_holes.featureCount())
summary["step8_after_deleteholes"] = no_holes.featureCount()
# ========================================================
# FINAL: TRIM STRING FIELDS
# ========================================================
print("\nFinal Step: Trim string fields") print("\nFinal Step: Trim string fields")
trimmed = processing.run( trimmed = processing.run(
"qgis:refactorfields", "qgis:refactorfields",
{ {
"INPUT": no_holes, "INPUT": final_proj,
"FIELDS_MAPPING": [ "FIELDS_MAPPING": [
{ {
"expression": f"trim(\"{field.name()}\")" "expression": f"trim(\"{field.name()}\")"
@ -603,19 +186,98 @@ def cleansing_layer(layer: QgsVectorLayer) -> Dict:
"length": field.length(), "length": field.length(),
"precision": field.precision() "precision": field.precision()
} }
for field in no_holes.fields() for field in final_proj.fields()
], ],
"KEEP_GEOMETRY": True, # <--- WAJIB
"OUTPUT": "memory:" "OUTPUT": "memory:"
} }
)["OUTPUT"] )["OUTPUT"]
valid_after = 0
for f in trimmed.getFeatures():
if f.geometry() is not None and f.geometry().isGeosValid():
valid_after += 1
summary["valid_after"] = valid_after
print(" - Final feature count:", trimmed.featureCount()) print(" - Final feature count:", trimmed.featureCount())
print("========== CLEANSING DONE ==========\n") print("========== CLEANSING DONE ==========\n")
return { return {"summary": summary, "clean_layer": trimmed}
"summary": summary,
"clean_layer": trimmed
def cleansing_points(layer: QgsVectorLayer):
    """Clean a point layer: validity check, fix, dedup, reproject, trim.

    Pipeline (all intermediate results in memory):
      1. qgis:checkvalidity     - count invalid geometries (informational)
      2. native:fixgeometries   - repair anything fixable
      3. delete duplicate geometries (identical point coordinates)
      4. reproject to EPSG:4326
      5. qgis:refactorfields    - trim() every text/varchar attribute

    Returns a dict: {"summary": <step counters>, "clean_layer": QgsVectorLayer}.
    """
    print("\n=== POINT CLEANING PIPELINE ===")
    summary = {
        "features_before": layer.featureCount(),
        "invalid_before": 0,
        "after_fix": 0,
        "after_dedup": 0,
        "after_reproject": 0,
        "valid_after": 0
    }
    # 1. Check validity (points rarely fail GEOS validity; kept for reporting)
    validity = processing.run(
        "qgis:checkvalidity",
        {"INPUT_LAYER": layer, "METHOD": 2, "VALID_OUTPUT": "memory:",
         "INVALID_OUTPUT": "memory:", "ERROR_OUTPUT": "memory:"}
    )
    invalid = validity["INVALID_OUTPUT"].featureCount()
    summary["invalid_before"] = invalid
    print("- Invalid points:", invalid)
    # 2. Fix geometries (safe no-op for already-valid points)
    fixed = processing.run(
        "native:fixgeometries", {"INPUT": layer, "OUTPUT": "memory:"}
    )["OUTPUT"]
    summary["after_fix"] = fixed.featureCount()
    # 3. Remove duplicate geometries.
    # BUG FIX: the registered algorithm id is "native:deleteduplicategeometries";
    # the previous id "native:removedduplicategeometries" does not exist and
    # raised QgsProcessingException at runtime.
    dedup = processing.run(
        "native:deleteduplicategeometries",
        {"INPUT": fixed, "OUTPUT": "memory:"}
    )["OUTPUT"]
    summary["after_dedup"] = dedup.featureCount()
    # 4. Reproject to the target CRS expected downstream
    reproject = processing.run(
        "native:reprojectlayer",
        {"INPUT": dedup, "TARGET_CRS": "EPSG:4326", "OUTPUT": "memory:"}
    )["OUTPUT"]
    summary["after_reproject"] = reproject.featureCount()
    # 5. Trim string fields; non-text fields are passed through unchanged
    trimmed = processing.run(
        "qgis:refactorfields",
        {
            "INPUT": reproject,
            "FIELDS_MAPPING": [
                {
                    "expression": f"trim(\"{field.name()}\")" if field.typeName().lower() in ["text", "varchar"]
                    else f"\"{field.name()}\"",
                    "name": field.name(),
                    "type": field.type(),
                    "length": field.length(),
                    "precision": field.precision(),
                }
                for field in reproject.fields()
            ],
            "KEEP_GEOMETRY": True,
            "OUTPUT": "memory:"
        }
    )["OUTPUT"]
    # 6. Simple post-check: count features that still carry a geometry
    valid_after = 0
    for f in trimmed.getFeatures():
        if f.geometry() is not None:
            valid_after += 1
    summary["valid_after"] = valid_after
    return {"summary": summary, "clean_layer": trimmed}

160
main.py
View File

@ -1,8 +1,20 @@
from fastapi import FastAPI, BackgroundTasks from fastapi import FastAPI, BackgroundTasks
from qgis_bootstrap import start_qgis import psycopg2
import requests
from uuid import uuid4 from uuid import uuid4
from qgis_bootstrap import start_qgis
# from cleansing_service import load_layer, cleansing_layer # from cleansing_service import load_layer, cleansing_layer
from full_cleansing_service import load_layer, cleansing_layer from full_cleansing_service import load_layer, cleansing_layer
from qgis.core import (
QgsVectorLayer,
QgsVectorLayerExporter,
QgsDataSourceUri,
QgsProviderRegistry,
QgsCoordinateReferenceSystem
)
from qgis.PyQt.QtCore import QByteArray
from core.config import HOST,PORT,DB,USER,PWD,SCHEMA,GEOM_COL
app = FastAPI() app = FastAPI()
@ -67,70 +79,116 @@ def run_clean_table(table_name: str, job_id: str):
"status": "FINISHED" "status": "FINISHED"
} }
import requests
requests.post( requests.post(
"http://backend-utama:8000/jobs/callback", "http://localhost:8000/jobs/callback",
json=callback_payload json=callback_payload
) )
print(f"=== Cleansing selesai untuk tabel: {table_name} ===\n") print(f"=== Cleansing selesai untuk tabel: {table_name} ===\n")
def to_python(v):
    """Unwrap a QVariant-like value into a native Python object.

    Order of checks:
      * plain ``None``                      -> ``None``
      * null QVariant (``isNull()`` truthy) -> ``None``
      * PyQt QVariant (``toPyObject()``)    -> the wrapped Python value
      * anything else                       -> returned unchanged
    """
    if v is None:
        return None
    # A QVariant may be non-None yet still represent SQL NULL.
    if hasattr(v, "isNull") and v.isNull():
        return None
    # PyQt's QVariant exposes toPyObject() for native conversion.
    if hasattr(v, "toPyObject"):
        return v.toPyObject()
    # Already a native Python value.
    return v
def save_to_postgis(layer, table_name):
host = HOST
port = PORT
db = DB
user = USER
pwd = PWD
schema = SCHEMA
geom_col = GEOM_COL
srid = layer.crs().postgisSrid()
fields = layer.fields()
# CONNECT
conn = psycopg2.connect(
from qgis.core import ( dbname=db,
QgsVectorLayer, host=host,
QgsVectorLayerExporter, port=port,
QgsDataSourceUri user=user,
password=pwd
) )
from database import POSTGIS cur = conn.cursor()
# DROP TABLE
cur.execute(f'DROP TABLE IF EXISTS "{schema}"."{table_name}" CASCADE')
def save_to_postgis(clean_layer: QgsVectorLayer, table_name: str): # CREATE TABLE
""" field_defs = []
Menghapus isi tabel dan menulis ulang hasil cleansing ke PostGIS. for f in fields:
Geometry harus MULTIPOLYGON dan SRID sudah benar. if f.name() == geom_col:
""" continue
print(f"[DB] Menyimpan hasil cleansing ke tabel {table_name}") # type mapping
t = f.typeName().lower()
# ------------------------------------------- if "int" in t:
# 1. Build URI PostGIS target pg_type = "INTEGER"
# ------------------------------------------- elif "double" in t or "float" in t or "real" in t:
uri = QgsDataSourceUri() pg_type = "DOUBLE PRECISION"
uri.setConnection(
POSTGIS['host'],
str(POSTGIS['port']),
POSTGIS['db'],
POSTGIS['user'],
POSTGIS['password']
)
# Nama schema & tabel
schema = "public"
uri.setDataSource(schema, table_name, "geom") # geometry column = geom
# -------------------------------------------
# 2. Export layer ke PostGIS (replace mode)
# -------------------------------------------
options = QgsVectorLayerExporter.ExportOptions()
options.actionOnExistingFile = QgsVectorLayerExporter.ActionOnExistingFile.OverwriteLayer
err_code, err_msg = QgsVectorLayerExporter.exportLayer(
clean_layer, # layer input
uri.uri(), # postgis connection uri
"postgres", # provider
clean_layer.crs(), # CRS layer
options
)
if err_code != QgsVectorLayerExporter.NoError:
print("[DB][ERROR] Gagal menyimpan:", err_msg)
else: else:
print("[DB] Berhasil update tabel", table_name) pg_type = "TEXT"
col = f.name().replace(" ", "_")
field_defs.append(f'"{col}" {pg_type}')
# geometry column
field_defs.append(f'"{geom_col}" geometry(MultiPolygon,{srid})')
create_sql = f'CREATE TABLE "{schema}"."{table_name}" ({",".join(field_defs)});'
cur.execute(create_sql)
# Prepare INSERT
attribute_columns = [
f'"{f.name().replace(" ", "_")}"'
for f in fields if f.name() != geom_col
]
insert_columns = attribute_columns + [f'"{geom_col}"']
placeholders = ["%s"] * len(insert_columns)
insert_sql = f"""
INSERT INTO "{schema}"."{table_name}"
({",".join(insert_columns)})
VALUES ({",".join(placeholders)})
"""
# INSERT ROWS
count = 0
for feat in layer.getFeatures():
attrs = feat.attributes()
row = []
for f, v in zip(fields, attrs):
if f.name() != geom_col:
row.append(to_python(v))
geom = feat.geometry()
wkb_bytes = geom.asWkb()
if isinstance(wkb_bytes, QByteArray):
wkb_bytes = bytes(wkb_bytes)
row.append(psycopg2.Binary(wkb_bytes))
cur.execute(insert_sql, row)
count += 1
conn.commit()
cur.close()
conn.close()
print(f"[DB] Inserted features: {count}")

View File

@ -17,7 +17,7 @@ os.environ["QT_QPA_PLATFORM"] = "offscreen"
sys.path.append(f"{QGIS_PREFIX}/python") sys.path.append(f"{QGIS_PREFIX}/python")
sys.path.append(f"{QGIS_PREFIX}/python/plugins") sys.path.append(f"{QGIS_PREFIX}/python/plugins")
from qgis.core import QgsApplication from qgis.core import QgsApplication, QgsProviderRegistry
from qgis.analysis import QgsNativeAlgorithms from qgis.analysis import QgsNativeAlgorithms
import processing import processing
@ -29,52 +29,7 @@ def start_qgis():
# === WAJIB: initialize processing === # === WAJIB: initialize processing ===
Processing.initialize() Processing.initialize()
QgsProviderRegistry.instance()
qgs.processingRegistry().addProvider(QgsNativeAlgorithms()) qgs.processingRegistry().addProvider(QgsNativeAlgorithms())
return qgs return qgs
# DEPLOYMENT
# import os
# import sys
# # QGIS environment
# os.environ["QGIS_PREFIX_PATH"] = "/usr"
# os.environ["QGIS_HOME"] = "/usr"
# os.environ["PROJ_LIB"] = "/usr/share/proj"
# os.environ["GDAL_DATA"] = "/usr/share/gdal"
# os.environ["QT_PLUGIN_PATH"] = "/usr/lib/x86_64-linux-gnu/qt5/plugins"
# os.environ["QT_QPA_PLATFORM"] = "offscreen"
# # QGIS Python plugins (THIS IS THE MISSING PART)
# sys.path.append("/usr/share/qgis/python")
# sys.path.append("/usr/share/qgis/python/plugins")
# # Python modules (from system)
# sys.path.append("/usr/lib/python3/dist-packages")
# sys.path.append("/usr/lib/python3/dist-packages/qgis")
# from qgis.core import QgsApplication
# from qgis.analysis import QgsNativeAlgorithms
# import processing
# from processing.core.Processing import Processing
# def start_qgis():
# qgs = QgsApplication([], False)
# qgs.initQgis()
# Processing.initialize()
# qgs.processingRegistry().addProvider(QgsNativeAlgorithms())
# return qgs