fixing cleansing data
This commit is contained in:
parent
421d0cf90b
commit
14852b6648
3
.gitignore
vendored
3
.gitignore
vendored
|
|
@ -1,5 +1,8 @@
|
|||
.env
|
||||
test_pg.py
|
||||
cleansing_service.py
|
||||
postgis_metadata.py
|
||||
database.py
|
||||
|
||||
__pycache__/
|
||||
data/
|
||||
13
core/config.py
Normal file
13
core/config.py
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
from dotenv import load_dotenv
|
||||
import os
|
||||
|
||||
load_dotenv()
|
||||
|
||||
HOST = os.getenv("host")
|
||||
PORT = os.getenv("port")
|
||||
DB = os.getenv("db")
|
||||
USER = os.getenv("user")
|
||||
PWD = os.getenv("pwd")
|
||||
SCHEMA = os.getenv("schema")
|
||||
GEOM_COL = os.getenv("geom_col")
|
||||
|
||||
21
database.py
21
database.py
|
|
@ -1,21 +0,0 @@
|
|||
POSTGIS = {
|
||||
"host": "192.168.60.24",
|
||||
"port": "5432",
|
||||
"db": "test_postgis",
|
||||
"user": "postgres",
|
||||
"password": "12345"
|
||||
}
|
||||
|
||||
def build_uri(table_name: str) -> str:
|
||||
return (
|
||||
f"dbname='{POSTGIS['db']}' "
|
||||
f"host='{POSTGIS['host']}' "
|
||||
f"port='{POSTGIS['port']}' "
|
||||
f"user='{POSTGIS['user']}' "
|
||||
f"password='{POSTGIS['password']}' "
|
||||
f"sslmode=disable "
|
||||
f"table=\"public\".\"{table_name}\" "
|
||||
f"key='_id'"
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -1,598 +1,181 @@
|
|||
from qgis.core import (
|
||||
QgsDataSourceUri,
|
||||
QgsFeature,
|
||||
QgsVectorLayer,
|
||||
QgsVectorLayerExporter,
|
||||
QgsVectorFileWriter
|
||||
QgsVectorFileWriter,
|
||||
QgsWkbTypes
|
||||
)
|
||||
import processing
|
||||
from typing import Dict
|
||||
from database import build_uri
|
||||
from core.config import HOST,PORT,DB,USER,PWD,SCHEMA,GEOM_COL
|
||||
|
||||
def load_layer(table_name: str):
|
||||
uri = build_uri(table_name)
|
||||
print('uri', uri)
|
||||
layer = QgsVectorLayer(uri, table_name, "postgres")
|
||||
uri = QgsDataSourceUri()
|
||||
uri.setConnection(HOST, PORT, DB, USER, PWD)
|
||||
uri.setDataSource(SCHEMA, table_name, GEOM_COL, "", "_id")
|
||||
|
||||
layer = QgsVectorLayer(uri.uri(), table_name, "postgres")
|
||||
|
||||
print("Layer valid:", layer.isValid())
|
||||
return layer
|
||||
|
||||
|
||||
# def cleansing_layer(layer: QgsVectorLayer) -> Dict:
|
||||
|
||||
# summary = {
|
||||
# "total_features_before": layer.featureCount(),
|
||||
# "invalid_geometries_before": 0,
|
||||
# "invalid_geometries_fixed": 0,
|
||||
# "duplicates_removed": 0,
|
||||
# "sliver_removed": 0,
|
||||
# "holes_removed": 0
|
||||
# }
|
||||
|
||||
# # ========================================================
|
||||
# # 1. IDENTIFY INVALID GEOMETRY
|
||||
# # ========================================================
|
||||
# invalid_ids = []
|
||||
# for f in layer.getFeatures():
|
||||
# if not f.geometry().isGeosValid():
|
||||
# invalid_ids.append(f.id())
|
||||
|
||||
# summary["invalid_geometries_before"] = len(invalid_ids)
|
||||
|
||||
# # ========================================================
|
||||
# # 2. FIX GEOMETRIES
|
||||
# # ========================================================
|
||||
# fixed = processing.run(
|
||||
# "native:fixgeometries",
|
||||
# {
|
||||
# "INPUT": layer,
|
||||
# "OUTPUT": "memory:"
|
||||
# }
|
||||
# )["OUTPUT"]
|
||||
|
||||
# summary["invalid_geometries_fixed"] = len(invalid_ids)
|
||||
|
||||
# # ========================================================
|
||||
# # 3. ENSURE MULTIPOLYGON
|
||||
# # ========================================================
|
||||
# multipolygon = processing.run(
|
||||
# "native:collect",
|
||||
# {
|
||||
# "INPUT": fixed,
|
||||
# "OUTPUT": "memory:"
|
||||
# }
|
||||
# )["OUTPUT"]
|
||||
|
||||
# # ========================================================
|
||||
# # 4. REMOVE DUPLICATE ROWS
|
||||
# # ========================================================
|
||||
# all_fields = [f.name() for f in multipolygon.fields()]
|
||||
# print("Detecting key fields:", all_fields)
|
||||
|
||||
# key_fields = None
|
||||
|
||||
# # (1) Prefer 'id'
|
||||
# if "id" in all_fields:
|
||||
# key_fields = ["id"]
|
||||
|
||||
# # (2) Else pick first integer field
|
||||
# if key_fields is None:
|
||||
# int_cols = [
|
||||
# f.name() for f in multipolygon.fields()
|
||||
# if f.typeName().lower() in ["int", "integer", "bigint"]
|
||||
# ]
|
||||
# if int_cols:
|
||||
# key_fields = [int_cols[0]]
|
||||
|
||||
# # (3) Else use all fields
|
||||
# if key_fields is None:
|
||||
# key_fields = all_fields
|
||||
|
||||
# print("Using key field:", key_fields)
|
||||
|
||||
# dedup = processing.run(
|
||||
# "native:removeduplicatesbyattribute",
|
||||
# {
|
||||
# "INPUT": multipolygon,
|
||||
# "FIELDS": key_fields,
|
||||
# "METHOD": 0,
|
||||
# "OUTPUT": "memory:"
|
||||
# }
|
||||
# )["OUTPUT"]
|
||||
|
||||
# summary["duplicates_removed"] = (
|
||||
# multipolygon.featureCount() - dedup.featureCount()
|
||||
# )
|
||||
|
||||
# # ========================================================
|
||||
# # 5. REMOVE DUPLICATE VERTICES
|
||||
# # ========================================================
|
||||
# no_dup_vertices = processing.run(
|
||||
# "native:removeduplicatevertices",
|
||||
# {
|
||||
# "INPUT": dedup,
|
||||
# "VERTICES": 0, # remove exact duplicates
|
||||
# "OUTPUT": "memory:"
|
||||
# }
|
||||
# )["OUTPUT"]
|
||||
|
||||
# # ========================================================
|
||||
# # 6. FIX SRID (REPROJECT IF NEEDED)
|
||||
# # ========================================================
|
||||
# # Force SRID to 4326
|
||||
# reprojected = processing.run(
|
||||
# "native:reprojectlayer",
|
||||
# {
|
||||
# "INPUT": no_dup_vertices,
|
||||
# "TARGET_CRS": "EPSG:4326",
|
||||
# "OUTPUT": "memory:"
|
||||
# }
|
||||
# )["OUTPUT"]
|
||||
|
||||
# # ========================================================
|
||||
# # 7. REMOVE SLIVER POLYGONS (< 1 m²)
|
||||
# # ========================================================
|
||||
# # Filter polygons with area < 1 (threshold bisa kamu ubah)
|
||||
# slivers = processing.run(
|
||||
# "native:extractbyexpression",
|
||||
# {
|
||||
# "INPUT": reprojected,
|
||||
# "EXPRESSION": "$area < 1",
|
||||
# "OUTPUT": "memory:"
|
||||
# }
|
||||
# )["OUTPUT"]
|
||||
|
||||
# summary["sliver_removed"] = slivers.featureCount()
|
||||
|
||||
# # Keep only polygons with area >= 1
|
||||
# no_sliver = processing.run(
|
||||
# "native:extractbyexpression",
|
||||
# {
|
||||
# "INPUT": reprojected,
|
||||
# "EXPRESSION": "$area >= 1",
|
||||
# "OUTPUT": "memory:"
|
||||
# }
|
||||
# )["OUTPUT"]
|
||||
|
||||
# # ========================================================
|
||||
# # 8. REMOVE TINY HOLES (< 1 m²)
|
||||
# # ========================================================
|
||||
# no_holes = processing.run(
|
||||
# "native:deleteholes",
|
||||
# {
|
||||
# "INPUT": no_sliver,
|
||||
# "MIN_AREA": 1, # minimum area of hole to keep
|
||||
# "OUTPUT": "memory:"
|
||||
# }
|
||||
# )["OUTPUT"]
|
||||
|
||||
# summary["holes_removed"] = 0 # can't count holes easily in PyQGIS
|
||||
|
||||
|
||||
# # ========================================================
|
||||
# # 9. TRIM STRING FIELDS (ATTRIBUTE CLEANSING)
|
||||
# # ========================================================
|
||||
# trimmed = processing.run(
|
||||
# "qgis:refactorfields",
|
||||
# {
|
||||
# "INPUT": no_holes,
|
||||
# "FIELDS_MAPPING": [
|
||||
# {
|
||||
# "expression": f"trim(\"{field.name()}\")"
|
||||
# if field.typeName().lower() in ["text", "varchar"]
|
||||
# else f"\"{field.name()}\"",
|
||||
# "name": field.name(),
|
||||
# "type": field.type(),
|
||||
# "length": field.length(),
|
||||
# "precision": field.precision()
|
||||
# }
|
||||
# for field in no_holes.fields()
|
||||
# ],
|
||||
# "OUTPUT": "memory:"
|
||||
# }
|
||||
# )["OUTPUT"]
|
||||
|
||||
# # ========================================================
|
||||
# # RETURN CLEANED LAYER
|
||||
# # ========================================================
|
||||
# return {
|
||||
# "summary": summary,
|
||||
# "clean_layer": trimmed
|
||||
# }
|
||||
|
||||
|
||||
# def cleansing_layer(layer: QgsVectorLayer) -> Dict:
|
||||
|
||||
# # ========================================================
|
||||
# # INITIAL STATE
|
||||
# # ========================================================
|
||||
# print("\n========== START CLEANSING ==========")
|
||||
# print("Step 0: Load Layer")
|
||||
# print(" - Valid:", layer.isValid())
|
||||
# print(" - Feature Count:", layer.featureCount())
|
||||
|
||||
# summary = {
|
||||
# "step0_features": layer.featureCount(),
|
||||
# "step1_invalid_before": 0,
|
||||
# "step2_after_fix": 0,
|
||||
# "step3_after_multipolygon": 0,
|
||||
# "step4_duplicates_removed": 0,
|
||||
# "step5_after_remove_vertices": 0,
|
||||
# "step6_after_srid": 0,
|
||||
# "step7_sliver_removed": 0,
|
||||
# "step8_after_deleteholes": 0
|
||||
# }
|
||||
|
||||
# # ========================================================
|
||||
# # 1. VALIDATE GEOMETRY
|
||||
# # ========================================================
|
||||
# print("\nStep 1: Identify invalid geometries")
|
||||
|
||||
# invalid_ids = []
|
||||
# for f in layer.getFeatures():
|
||||
# if not f.geometry().isGeosValid():
|
||||
# invalid_ids.append(f.id())
|
||||
|
||||
# summary["step1_invalid_before"] = len(invalid_ids)
|
||||
|
||||
# print(" - Invalid geometries found:", len(invalid_ids))
|
||||
|
||||
# # ========================================================
|
||||
# # 2. FIX GEOMETRIES
|
||||
# # ========================================================
|
||||
# print("\nStep 2: Fix geometries")
|
||||
# fixed = processing.run(
|
||||
# "native:fixgeometries",
|
||||
# {"INPUT": layer, "OUTPUT": "memory:"}
|
||||
# )["OUTPUT"]
|
||||
|
||||
# print(" - Valid:", fixed.isValid())
|
||||
# print(" - Features after fix:", fixed.featureCount())
|
||||
# summary["step2_after_fix"] = fixed.featureCount()
|
||||
|
||||
# # ========================================================
|
||||
# # 3. ENSURE MULTIPOLYGON
|
||||
# # ========================================================
|
||||
# print("\nStep 3: Ensure MULTIPOLYGON")
|
||||
# multipolygon = processing.run(
|
||||
# "native:collect",
|
||||
# {"INPUT": fixed, "OUTPUT": "memory:"}
|
||||
# )["OUTPUT"]
|
||||
|
||||
# print(" - Valid:", multipolygon.isValid())
|
||||
# print(" - Features:", multipolygon.featureCount())
|
||||
# summary["step3_after_multipolygon"] = multipolygon.featureCount()
|
||||
|
||||
# # ========================================================
|
||||
# # 4. REMOVE DUPLICATE ROWS
|
||||
# # ========================================================
|
||||
# print("\nStep 4: Remove duplicate rows")
|
||||
|
||||
# all_fields = [f.name() for f in multipolygon.fields()]
|
||||
# print(" - All fields:", all_fields)
|
||||
|
||||
# key_fields = None
|
||||
|
||||
# if "id" in all_fields:
|
||||
# key_fields = ["id"]
|
||||
# else:
|
||||
# int_cols = [
|
||||
# f.name() for f in multipolygon.fields()
|
||||
# if f.typeName().lower() in ["int", "integer", "bigint"]
|
||||
# ]
|
||||
# if int_cols:
|
||||
# key_fields = [int_cols[0]]
|
||||
# else:
|
||||
# key_fields = all_fields
|
||||
|
||||
# print(" - Using duplicate key:", key_fields)
|
||||
|
||||
# dedup = processing.run(
|
||||
# "native:removeduplicatesbyattribute",
|
||||
# {"INPUT": multipolygon, "FIELDS": key_fields, "METHOD": 0, "OUTPUT": "memory:"}
|
||||
# )["OUTPUT"]
|
||||
|
||||
# duplicates_removed = multipolygon.featureCount() - dedup.featureCount()
|
||||
# summary["step4_duplicates_removed"] = duplicates_removed
|
||||
|
||||
# print(" - Features before:", multipolygon.featureCount())
|
||||
# print(" - Features after:", dedup.featureCount())
|
||||
# print(" - Duplicates removed:", duplicates_removed)
|
||||
|
||||
# # ========================================================
|
||||
# # 5. REMOVE DUPLICATE VERTICES
|
||||
# # ========================================================
|
||||
# print("\nStep 5: Remove duplicate vertices")
|
||||
|
||||
# no_dup_vertices = processing.run(
|
||||
# "native:removeduplicatevertices",
|
||||
# {"INPUT": dedup, "VERTICES": 0, "OUTPUT": "memory:"}
|
||||
# )["OUTPUT"]
|
||||
|
||||
# print(" - Features:", no_dup_vertices.featureCount())
|
||||
# summary["step5_after_remove_vertices"] = no_dup_vertices.featureCount()
|
||||
|
||||
# # ========================================================
|
||||
# # 6. FIX SRID / REPROJECT
|
||||
# # ========================================================
|
||||
# print("\nStep 6: Reproject (Fix SRID to EPSG:4326)")
|
||||
|
||||
# reprojected = processing.run(
|
||||
# "native:reprojectlayer",
|
||||
# {"INPUT": no_dup_vertices, "TARGET_CRS": "EPSG:4326", "OUTPUT": "memory:"}
|
||||
# )["OUTPUT"]
|
||||
|
||||
# print(" - Features:", reprojected.featureCount())
|
||||
# summary["step6_after_srid"] = reprojected.featureCount()
|
||||
|
||||
# # ========================================================
|
||||
# # 7. REMOVE SLIVER POLYGONS (< 1 m2)
|
||||
# # ========================================================
|
||||
# print("\nStep 7: Remove sliver polygons (<1 m²)")
|
||||
|
||||
# slivers = processing.run(
|
||||
# "native:extractbyexpression",
|
||||
# {"INPUT": reprojected, "EXPRESSION": "$area < 1", "OUTPUT": "memory:"}
|
||||
# )["OUTPUT"]
|
||||
|
||||
# summary["step7_sliver_removed"] = slivers.featureCount()
|
||||
# print(" - Slivers found:", slivers.featureCount())
|
||||
|
||||
# no_sliver = processing.run(
|
||||
# "native:extractbyexpression",
|
||||
# {"INPUT": reprojected, "EXPRESSION": "$area >= 1", "OUTPUT": "memory:"}
|
||||
# )["OUTPUT"]
|
||||
|
||||
# print(" - Features left after removing slivers:", no_sliver.featureCount())
|
||||
|
||||
# # ========================================================
|
||||
# # 8. REMOVE TINY HOLES (< 1 m2)
|
||||
# # ========================================================
|
||||
# print("\nStep 8: Remove tiny holes")
|
||||
|
||||
# no_holes = processing.run(
|
||||
# "native:deleteholes",
|
||||
# {"INPUT": no_sliver, "MIN_AREA": 1, "OUTPUT": "memory:"}
|
||||
# )["OUTPUT"]
|
||||
|
||||
# print(" - Features:", no_holes.featureCount())
|
||||
# summary["step8_after_deleteholes"] = no_holes.featureCount()
|
||||
|
||||
# # ========================================================
|
||||
# # FINISH (TRIM ATTRIBUTES)
|
||||
# # ========================================================
|
||||
# print("\nFinal Step: Trim string fields")
|
||||
|
||||
# trimmed = processing.run(
|
||||
# "qgis:refactorfields",
|
||||
# {
|
||||
# "INPUT": no_holes,
|
||||
# "FIELDS_MAPPING": [
|
||||
# {
|
||||
# "expression": f"trim(\"{field.name()}\")"
|
||||
# if field.typeName().lower() in ["text", "varchar"]
|
||||
# else f"\"{field.name()}\"",
|
||||
# "name": field.name(),
|
||||
# "type": field.type(),
|
||||
# "length": field.length(),
|
||||
# "precision": field.precision()
|
||||
# }
|
||||
# for field in no_holes.fields()
|
||||
# ],
|
||||
# "OUTPUT": "memory:"
|
||||
# }
|
||||
# )["OUTPUT"]
|
||||
|
||||
# print(" - Final feature count:", trimmed.featureCount())
|
||||
# print("========== CLEANSING DONE ==========\n")
|
||||
|
||||
# return {
|
||||
# "summary": summary,
|
||||
# "clean_layer": trimmed
|
||||
# }
|
||||
|
||||
|
||||
|
||||
# self-intersection
|
||||
def cleansing_layer(layer: QgsVectorLayer) -> Dict:
|
||||
|
||||
# ========================================================
|
||||
# INITIAL STATE
|
||||
# ========================================================
|
||||
print("\n========== START CLEANSING ==========")
|
||||
print("Step 0: Load Layer")
|
||||
print(" - Valid:", layer.isValid())
|
||||
print(" - Feature Count:", layer.featureCount())
|
||||
print(" - type:", layer.geometryType())
|
||||
|
||||
summary = {
|
||||
"step0_features": layer.featureCount(),
|
||||
"step1_invalid_before": 0,
|
||||
"step1_5_self_intersections": 0,
|
||||
"step2_after_fix": 0,
|
||||
"step3_after_multipolygon": 0,
|
||||
"step4_duplicates_removed": 0,
|
||||
"step5_after_remove_vertices": 0,
|
||||
"step6_after_srid": 0,
|
||||
"step7_sliver_removed": 0,
|
||||
"step8_after_deleteholes": 0
|
||||
"features": layer.featureCount(),
|
||||
"invalid_before": 0,
|
||||
"after_fixgeometries": 0,
|
||||
"after_fix": 0,
|
||||
"after_multipolygon": 0,
|
||||
"duplicates_removed": 0,
|
||||
"after_remove_vertices": 0,
|
||||
"after_srid": 0,
|
||||
"sliver_removed": 0,
|
||||
"after_deleteholes": 0,
|
||||
"valid_after": 0
|
||||
}
|
||||
|
||||
# ========================================================
|
||||
# 1. VALIDATE GEOMETRY
|
||||
# ========================================================
|
||||
print("\nStep 1: Identify invalid geometries")
|
||||
# 1. Geometry validity check
|
||||
print("\nStep 1: Geometry validity check (QGIS native)")
|
||||
validity = processing.run(
|
||||
"qgis:checkvalidity",
|
||||
{
|
||||
"INPUT_LAYER": layer,
|
||||
"METHOD": 2, # GEOS
|
||||
"IGNORE_RING_SELF_INTERSECTION": False,
|
||||
"VALID_OUTPUT": "memory:",
|
||||
"INVALID_OUTPUT": "memory:",
|
||||
"ERROR_OUTPUT": "memory:"
|
||||
}
|
||||
)
|
||||
invalid_layer = validity["INVALID_OUTPUT"]
|
||||
error_table = validity["ERROR_OUTPUT"]
|
||||
invalid_count = invalid_layer.featureCount()
|
||||
summary["invalid_before"] = invalid_count
|
||||
print(" - Invalid geometries found:", invalid_count)
|
||||
print(" - Total error messages:", error_table.featureCount())
|
||||
|
||||
invalid_ids = []
|
||||
for f in layer.getFeatures():
|
||||
if not f.geometry().isGeosValid():
|
||||
invalid_ids.append(f.id())
|
||||
# 1.1 Fix invalid geometries
|
||||
# print("\nStep 1.1: Fix invalid geometries (FixGeometries)")
|
||||
# fixed_pre = processing.run("native:fixgeometries", {"INPUT": layer, "OUTPUT": "memory:"})["OUTPUT"]
|
||||
# summary["after_fixgeometries"] = fixed_pre.featureCount()
|
||||
# print(" - Features after FixGeometries:", fixed_pre.featureCount())
|
||||
# layer = fixed_pre
|
||||
|
||||
summary["step1_invalid_before"] = len(invalid_ids)
|
||||
print(" - Invalid geometries found:", len(invalid_ids))
|
||||
|
||||
# ========================================================
|
||||
# 1.5 DETECT GEOMETRY ERRORS (MANUAL GEOS VALIDATION)
|
||||
# ========================================================
|
||||
print("\nStep 1.5: Detect geometry errors (universal GEOS-safe method)")
|
||||
|
||||
errors = []
|
||||
|
||||
for f in layer.getFeatures():
|
||||
geom = f.geometry()
|
||||
if not geom.isGeosValid():
|
||||
# Kita hanya tandai invalid (tanpa reason)
|
||||
errors.append(f.id())
|
||||
|
||||
summary["step1_5_geometry_errors"] = len(errors)
|
||||
|
||||
print(" - Geometry errors detected:", len(errors))
|
||||
print(" - Invalid feature IDs (first 10):", errors[:10])
|
||||
|
||||
|
||||
|
||||
# ========================================================
|
||||
# 1.6 FIX INVALID GEOMETRIES (Native FixGeometries)
|
||||
# ========================================================
|
||||
print("\nStep 1.6: Fix invalid geometries (FixGeometries)")
|
||||
|
||||
fixed_pre = processing.run(
|
||||
"native:fixgeometries",
|
||||
{"INPUT": layer, "OUTPUT": "memory:"}
|
||||
)["OUTPUT"]
|
||||
|
||||
summary["step1_6_after_fixgeometries"] = fixed_pre.featureCount()
|
||||
|
||||
print(" - Features after FixGeometries:", fixed_pre.featureCount())
|
||||
|
||||
layer = fixed_pre
|
||||
|
||||
|
||||
|
||||
# ========================================================
|
||||
# 2. FIX GEOMETRIES (INCLUDES SELF-INTERSECTION FIX)
|
||||
# ========================================================
|
||||
# 2. Fix geometries (again)
|
||||
print("\nStep 2: Fix geometries (including self-intersections)")
|
||||
|
||||
fixed = processing.run(
|
||||
"native:fixgeometries",
|
||||
{"INPUT": layer, "OUTPUT": "memory:"}
|
||||
)["OUTPUT"]
|
||||
|
||||
fixed = processing.run("native:fixgeometries", {"INPUT": layer, "OUTPUT": "memory:"})["OUTPUT"]
|
||||
print(" - Valid after fix:", fixed.isValid())
|
||||
print(" - Features after fix:", fixed.featureCount())
|
||||
summary["step2_after_fix"] = fixed.featureCount()
|
||||
summary["after_fix"] = fixed.featureCount()
|
||||
|
||||
# ========================================================
|
||||
# 3. ENSURE MULTIPOLYGON
|
||||
# 3. ENSURE MULTIPOLYGON (LTR compatible!!)
|
||||
# ========================================================
|
||||
print("\nStep 3: Ensure MULTIPOLYGON")
|
||||
print("\nStep 3: Ensure MULTIPOLYGON (LTR-safe method)")
|
||||
|
||||
multipolygon = processing.run(
|
||||
"native:collect",
|
||||
# Step 3.1: Pecah multiparts → single (agar bersih)
|
||||
singleparts = processing.run(
|
||||
"native:multiparttosingleparts",
|
||||
{"INPUT": fixed, "OUTPUT": "memory:"}
|
||||
)["OUTPUT"]
|
||||
|
||||
print(" - After multiparttosingleparts:", singleparts.featureCount())
|
||||
|
||||
# Step 3.2: Promote semua polygon → multipolygon
|
||||
multipolygon = processing.run(
|
||||
"native:promotetomulti",
|
||||
{"INPUT": singleparts, "OUTPUT": "memory:"}
|
||||
)["OUTPUT"]
|
||||
|
||||
print(" - After promotetomulti:", multipolygon.featureCount())
|
||||
print(" - Valid:", multipolygon.isValid())
|
||||
print(" - Features:", multipolygon.featureCount())
|
||||
summary["step3_after_multipolygon"] = multipolygon.featureCount()
|
||||
|
||||
# ========================================================
|
||||
# 4. REMOVE DUPLICATE ROWS
|
||||
# ========================================================
|
||||
summary["after_multipolygon"] = multipolygon.featureCount()
|
||||
|
||||
|
||||
# 4. Remove duplicate rows
|
||||
print("\nStep 4: Remove duplicate rows")
|
||||
|
||||
all_fields = [f.name() for f in multipolygon.fields()]
|
||||
print(" - All fields:", all_fields)
|
||||
|
||||
if "id" in all_fields:
|
||||
key_fields = ["id"]
|
||||
else:
|
||||
int_cols = [
|
||||
f.name() for f in multipolygon.fields()
|
||||
if f.typeName().lower() in ["int", "integer", "bigint"]
|
||||
]
|
||||
int_cols = [f.name() for f in multipolygon.fields() if f.typeName().lower() in ["int", "integer", "bigint"]]
|
||||
key_fields = [int_cols[0]] if int_cols else all_fields
|
||||
|
||||
print(" - Using duplicate key:", key_fields)
|
||||
|
||||
dedup = processing.run(
|
||||
"native:removeduplicatesbyattribute",
|
||||
{"INPUT": multipolygon, "FIELDS": key_fields, "METHOD": 0, "OUTPUT": "memory:"}
|
||||
)["OUTPUT"]
|
||||
|
||||
dedup = processing.run("native:removeduplicatesbyattribute", {"INPUT": multipolygon, "FIELDS": key_fields, "METHOD": 0, "OUTPUT": "memory:"})["OUTPUT"]
|
||||
duplicates_removed = multipolygon.featureCount() - dedup.featureCount()
|
||||
summary["step4_duplicates_removed"] = duplicates_removed
|
||||
|
||||
summary["duplicates_removed"] = duplicates_removed
|
||||
print(" - Features before:", multipolygon.featureCount())
|
||||
print(" - Features after:", dedup.featureCount())
|
||||
print(" - Duplicates removed:", duplicates_removed)
|
||||
|
||||
# ========================================================
|
||||
# 5. REMOVE DUPLICATE VERTICES
|
||||
# ========================================================
|
||||
# 5. Remove duplicate vertices
|
||||
print("\nStep 5: Remove duplicate vertices")
|
||||
|
||||
no_dup_vertices = processing.run(
|
||||
"native:removeduplicatevertices",
|
||||
{"INPUT": dedup, "VERTICES": 0, "OUTPUT": "memory:"}
|
||||
)["OUTPUT"]
|
||||
|
||||
no_dup_vertices = processing.run("native:removeduplicatevertices", {"INPUT": dedup, "VERTICES": 0, "OUTPUT": "memory:"})["OUTPUT"]
|
||||
print(" - Features:", no_dup_vertices.featureCount())
|
||||
summary["step5_after_remove_vertices"] = no_dup_vertices.featureCount()
|
||||
summary["after_remove_vertices"] = no_dup_vertices.featureCount()
|
||||
|
||||
# ========================================================
|
||||
# 6. FIX SRID / REPROJECT
|
||||
# ========================================================
|
||||
print("\nStep 6: Reproject (Fix SRID to EPSG:4326)")
|
||||
print("\nStep 5.5: Check input CRS before reprojection")
|
||||
input_crs = no_dup_vertices.crs()
|
||||
if input_crs.isValid():
|
||||
print(" - Input CRS:", input_crs.authid())
|
||||
print(" - CRS description:", input_crs.description())
|
||||
else:
|
||||
print(" - CRS INVALID or UNDEFINED")
|
||||
|
||||
reprojected = processing.run(
|
||||
"native:reprojectlayer",
|
||||
{"INPUT": no_dup_vertices, "TARGET_CRS": "EPSG:4326", "OUTPUT": "memory:"}
|
||||
)["OUTPUT"]
|
||||
# 6. REPROJECT to metric CRS BEFORE any area-based ops (use EPSG:4326 or local UTM)
|
||||
print("\nStep 6: Reproject layer to EPSG:4326 for metric area calculations")
|
||||
# choose EPSG:4326 or better choose local UTM if you know it; EPSG:4326 is general metric
|
||||
final_proj = processing.run("native:reprojectlayer", {"INPUT": no_dup_vertices, "TARGET_CRS": "EPSG:4326", "OUTPUT": "memory:"})["OUTPUT"]
|
||||
print(" - Features after reproject:", final_proj.featureCount())
|
||||
summary["after_srid"] = final_proj.featureCount()
|
||||
|
||||
print(" - Features:", reprojected.featureCount())
|
||||
summary["step6_after_srid"] = reprojected.featureCount()
|
||||
|
||||
# ========================================================
|
||||
# 7. REMOVE SLIVER POLYGONS (< 1 m2)
|
||||
# ========================================================
|
||||
print("\nStep 7: Remove sliver polygons (<1 m²)")
|
||||
# 7. Remove sliver polygons based on metric area (< 1 m^2)
|
||||
# print("\nStep 7: Remove sliver polygons (<1 m²)")
|
||||
# # use $area now because layer is in meters (EPSG:3857)
|
||||
# slivers = processing.run("native:extractbyexpression", {"INPUT": reprojected, "EXPRESSION": "$area < 1", "OUTPUT": "memory:"})["OUTPUT"]
|
||||
# summary["sliver_removed"] = slivers.featureCount()
|
||||
# print(" - Slivers found:", slivers.featureCount())
|
||||
# no_sliver = processing.run(
|
||||
# "native:extractbyexpression",
|
||||
# {
|
||||
# "INPUT": reprojected,
|
||||
# "EXPRESSION": "geometry IS NOT NULL AND $area >= 1",
|
||||
# "OUTPUT": "memory:"
|
||||
# }
|
||||
# )["OUTPUT"]
|
||||
# print(" - Features left after removing slivers:", no_sliver.featureCount())
|
||||
|
||||
slivers = processing.run(
|
||||
"native:extractbyexpression",
|
||||
{"INPUT": reprojected, "EXPRESSION": "$area < 1", "OUTPUT": "memory:"}
|
||||
)["OUTPUT"]
|
||||
# # 8. Remove tiny holes (<1 m^2) — still in metric CRS
|
||||
# print("\nStep 8: Remove tiny holes (<1 m²)")
|
||||
# no_holes = processing.run("native:deleteholes", {"INPUT": no_sliver, "MIN_AREA": 1, "OUTPUT": "memory:"})["OUTPUT"]
|
||||
# print(" - Features after delete holes:", no_holes.featureCount())
|
||||
# summary["after_deleteholes"] = no_holes.featureCount()
|
||||
|
||||
summary["step7_sliver_removed"] = slivers.featureCount()
|
||||
print(" - Slivers found:", slivers.featureCount())
|
||||
# # Reproject BACK to EPSG:4326 for downstream (GeoServer/PostGIS target)
|
||||
# print("\nStep 9: Reproject back to EPSG:4326")
|
||||
# final_proj = processing.run("native:reprojectlayer", {"INPUT": no_holes, "TARGET_CRS": "EPSG:4326", "OUTPUT": "memory:"})["OUTPUT"]
|
||||
# print(" - Features:", final_proj.featureCount())
|
||||
|
||||
no_sliver = processing.run(
|
||||
"native:extractbyexpression",
|
||||
{"INPUT": reprojected, "EXPRESSION": "$area >= 1", "OUTPUT": "memory:"}
|
||||
)["OUTPUT"]
|
||||
|
||||
print(" - Features left after removing slivers:", no_sliver.featureCount())
|
||||
|
||||
# ========================================================
|
||||
# 8. REMOVE TINY HOLES (< 1 m2)
|
||||
# ========================================================
|
||||
print("\nStep 8: Remove tiny holes")
|
||||
|
||||
no_holes = processing.run(
|
||||
"native:deleteholes",
|
||||
{"INPUT": no_sliver, "MIN_AREA": 1, "OUTPUT": "memory:"}
|
||||
)["OUTPUT"]
|
||||
|
||||
print(" - Features:", no_holes.featureCount())
|
||||
summary["step8_after_deleteholes"] = no_holes.featureCount()
|
||||
|
||||
# ========================================================
|
||||
# FINAL: TRIM STRING FIELDS
|
||||
# ========================================================
|
||||
# Final: Trim string fields
|
||||
print("\nFinal Step: Trim string fields")
|
||||
|
||||
trimmed = processing.run(
|
||||
"qgis:refactorfields",
|
||||
{
|
||||
"INPUT": no_holes,
|
||||
"INPUT": final_proj,
|
||||
"FIELDS_MAPPING": [
|
||||
{
|
||||
"expression": f"trim(\"{field.name()}\")"
|
||||
|
|
@ -603,19 +186,98 @@ def cleansing_layer(layer: QgsVectorLayer) -> Dict:
|
|||
"length": field.length(),
|
||||
"precision": field.precision()
|
||||
}
|
||||
for field in no_holes.fields()
|
||||
for field in final_proj.fields()
|
||||
],
|
||||
"KEEP_GEOMETRY": True, # <--- WAJIB
|
||||
"OUTPUT": "memory:"
|
||||
}
|
||||
)["OUTPUT"]
|
||||
|
||||
|
||||
valid_after = 0
|
||||
for f in trimmed.getFeatures():
|
||||
if f.geometry() is not None and f.geometry().isGeosValid():
|
||||
valid_after += 1
|
||||
summary["valid_after"] = valid_after
|
||||
|
||||
print(" - Final feature count:", trimmed.featureCount())
|
||||
print("========== CLEANSING DONE ==========\n")
|
||||
|
||||
return {
|
||||
"summary": summary,
|
||||
"clean_layer": trimmed
|
||||
return {"summary": summary, "clean_layer": trimmed}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def cleansing_points(layer: QgsVectorLayer):
|
||||
print("\n=== POINT CLEANING PIPELINE ===")
|
||||
|
||||
summary = {
|
||||
"features_before": layer.featureCount(),
|
||||
"invalid_before": 0,
|
||||
"after_fix": 0,
|
||||
"after_dedup": 0,
|
||||
"after_reproject": 0,
|
||||
"valid_after": 0
|
||||
}
|
||||
|
||||
# 1. Check validity (will always return 0 errors for points)
|
||||
validity = processing.run(
|
||||
"qgis:checkvalidity",
|
||||
{"INPUT_LAYER": layer, "METHOD": 2, "VALID_OUTPUT": "memory:", "INVALID_OUTPUT": "memory:", "ERROR_OUTPUT": "memory:"}
|
||||
)
|
||||
invalid = validity["INVALID_OUTPUT"].featureCount()
|
||||
summary["invalid_before"] = invalid
|
||||
print("- Invalid points:", invalid)
|
||||
|
||||
# 2. Fix geometries (safe)
|
||||
fixed = processing.run("native:fixgeometries", {"INPUT": layer, "OUTPUT": "memory:"})["OUTPUT"]
|
||||
summary["after_fix"] = fixed.featureCount()
|
||||
|
||||
# 3. Remove duplicate coordinates (points only)
|
||||
dedup = processing.run(
|
||||
"native:removedduplicategeometries",
|
||||
{"INPUT": fixed, "OUTPUT": "memory:"}
|
||||
)["OUTPUT"]
|
||||
summary["after_dedup"] = dedup.featureCount()
|
||||
|
||||
# 4. Reproject
|
||||
reproject = processing.run(
|
||||
"native:reprojectlayer",
|
||||
{"INPUT": dedup, "TARGET_CRS": "EPSG:4326", "OUTPUT": "memory:"}
|
||||
)["OUTPUT"]
|
||||
summary["after_reproject"] = reproject.featureCount()
|
||||
|
||||
# 5. Trim string fields
|
||||
trimmed = processing.run(
|
||||
"qgis:refactorfields",
|
||||
{
|
||||
"INPUT": reproject,
|
||||
"FIELDS_MAPPING": [
|
||||
{
|
||||
"expression": f"trim(\"{field.name()}\")" if field.typeName().lower() in ["text","varchar"]
|
||||
else f"\"{field.name()}\"",
|
||||
"name": field.name(),
|
||||
"type": field.type(),
|
||||
"length": field.length(),
|
||||
"precision": field.precision(),
|
||||
}
|
||||
for field in reproject.fields()
|
||||
],
|
||||
"KEEP_GEOMETRY": True,
|
||||
"OUTPUT": "memory:"
|
||||
}
|
||||
)["OUTPUT"]
|
||||
|
||||
# 6. Validity check for points (simple)
|
||||
valid_after = 0
|
||||
for f in trimmed.getFeatures():
|
||||
if f.geometry() is not None:
|
||||
valid_after += 1
|
||||
|
||||
summary["valid_after"] = valid_after
|
||||
|
||||
return {"summary": summary, "clean_layer": trimmed}
|
||||
|
|
|
|||
160
main.py
160
main.py
|
|
@ -1,8 +1,20 @@
|
|||
from fastapi import FastAPI, BackgroundTasks
|
||||
from qgis_bootstrap import start_qgis
|
||||
import psycopg2
|
||||
import requests
|
||||
from uuid import uuid4
|
||||
from qgis_bootstrap import start_qgis
|
||||
# from cleansing_service import load_layer, cleansing_layer
|
||||
from full_cleansing_service import load_layer, cleansing_layer
|
||||
from qgis.core import (
|
||||
QgsVectorLayer,
|
||||
QgsVectorLayerExporter,
|
||||
QgsDataSourceUri,
|
||||
QgsProviderRegistry,
|
||||
QgsCoordinateReferenceSystem
|
||||
)
|
||||
from qgis.PyQt.QtCore import QByteArray
|
||||
from core.config import HOST,PORT,DB,USER,PWD,SCHEMA,GEOM_COL
|
||||
|
||||
|
||||
app = FastAPI()
|
||||
|
||||
|
|
@ -67,70 +79,116 @@ def run_clean_table(table_name: str, job_id: str):
|
|||
"status": "FINISHED"
|
||||
}
|
||||
|
||||
import requests
|
||||
requests.post(
|
||||
"http://backend-utama:8000/jobs/callback",
|
||||
"http://localhost:8000/jobs/callback",
|
||||
json=callback_payload
|
||||
)
|
||||
|
||||
print(f"=== Cleansing selesai untuk tabel: {table_name} ===\n")
|
||||
|
||||
def to_python(v):
    """Return *v* as a plain Python object.

    ``None`` and null QVariants both map to ``None``; a QVariant wrapper
    is unwrapped via ``toPyObject``; any other value is returned as-is.
    """
    # None and null QVariants collapse to None.
    if v is None or (hasattr(v, "isNull") and v.isNull()):
        return None
    # Unwrap QVariant to its native equivalent when possible; otherwise
    # the value is already a native Python object.
    return v.toPyObject() if hasattr(v, "toPyObject") else v
|
||||
|
||||
def save_to_postgis(layer, table_name):
|
||||
host = HOST
|
||||
port = PORT
|
||||
db = DB
|
||||
user = USER
|
||||
pwd = PWD
|
||||
schema = SCHEMA
|
||||
geom_col = GEOM_COL
|
||||
|
||||
srid = layer.crs().postgisSrid()
|
||||
fields = layer.fields()
|
||||
|
||||
|
||||
|
||||
from qgis.core import (
|
||||
QgsVectorLayer,
|
||||
QgsVectorLayerExporter,
|
||||
QgsDataSourceUri
|
||||
# CONNECT
|
||||
conn = psycopg2.connect(
|
||||
dbname=db,
|
||||
host=host,
|
||||
port=port,
|
||||
user=user,
|
||||
password=pwd
|
||||
)
|
||||
from database import POSTGIS
|
||||
cur = conn.cursor()
|
||||
|
||||
# DROP TABLE
|
||||
cur.execute(f'DROP TABLE IF EXISTS "{schema}"."{table_name}" CASCADE')
|
||||
|
||||
def save_to_postgis(clean_layer: QgsVectorLayer, table_name: str):
|
||||
"""
|
||||
Menghapus isi tabel dan menulis ulang hasil cleansing ke PostGIS.
|
||||
Geometry harus MULTIPOLYGON dan SRID sudah benar.
|
||||
"""
|
||||
# CREATE TABLE
|
||||
field_defs = []
|
||||
for f in fields:
|
||||
if f.name() == geom_col:
|
||||
continue
|
||||
|
||||
print(f"[DB] Menyimpan hasil cleansing ke tabel {table_name}")
|
||||
|
||||
# -------------------------------------------
|
||||
# 1. Build URI PostGIS target
|
||||
# -------------------------------------------
|
||||
uri = QgsDataSourceUri()
|
||||
uri.setConnection(
|
||||
POSTGIS['host'],
|
||||
str(POSTGIS['port']),
|
||||
POSTGIS['db'],
|
||||
POSTGIS['user'],
|
||||
POSTGIS['password']
|
||||
)
|
||||
|
||||
# Nama schema & tabel
|
||||
schema = "public"
|
||||
uri.setDataSource(schema, table_name, "geom") # geometry column = geom
|
||||
|
||||
# -------------------------------------------
|
||||
# 2. Export layer ke PostGIS (replace mode)
|
||||
# -------------------------------------------
|
||||
|
||||
options = QgsVectorLayerExporter.ExportOptions()
|
||||
options.actionOnExistingFile = QgsVectorLayerExporter.ActionOnExistingFile.OverwriteLayer
|
||||
|
||||
err_code, err_msg = QgsVectorLayerExporter.exportLayer(
|
||||
clean_layer, # layer input
|
||||
uri.uri(), # postgis connection uri
|
||||
"postgres", # provider
|
||||
clean_layer.crs(), # CRS layer
|
||||
options
|
||||
)
|
||||
|
||||
if err_code != QgsVectorLayerExporter.NoError:
|
||||
print("[DB][ERROR] Gagal menyimpan:", err_msg)
|
||||
# type mapping
|
||||
t = f.typeName().lower()
|
||||
if "int" in t:
|
||||
pg_type = "INTEGER"
|
||||
elif "double" in t or "float" in t or "real" in t:
|
||||
pg_type = "DOUBLE PRECISION"
|
||||
else:
|
||||
print("[DB] Berhasil update tabel", table_name)
|
||||
pg_type = "TEXT"
|
||||
|
||||
col = f.name().replace(" ", "_")
|
||||
field_defs.append(f'"{col}" {pg_type}')
|
||||
|
||||
# geometry column
|
||||
field_defs.append(f'"{geom_col}" geometry(MultiPolygon,{srid})')
|
||||
|
||||
create_sql = f'CREATE TABLE "{schema}"."{table_name}" ({",".join(field_defs)});'
|
||||
cur.execute(create_sql)
|
||||
|
||||
# Prepare INSERT
|
||||
attribute_columns = [
|
||||
f'"{f.name().replace(" ", "_")}"'
|
||||
for f in fields if f.name() != geom_col
|
||||
]
|
||||
insert_columns = attribute_columns + [f'"{geom_col}"']
|
||||
placeholders = ["%s"] * len(insert_columns)
|
||||
|
||||
insert_sql = f"""
|
||||
INSERT INTO "{schema}"."{table_name}"
|
||||
({",".join(insert_columns)})
|
||||
VALUES ({",".join(placeholders)})
|
||||
"""
|
||||
|
||||
# INSERT ROWS
|
||||
count = 0
|
||||
for feat in layer.getFeatures():
|
||||
attrs = feat.attributes()
|
||||
|
||||
row = []
|
||||
for f, v in zip(fields, attrs):
|
||||
if f.name() != geom_col:
|
||||
row.append(to_python(v))
|
||||
|
||||
geom = feat.geometry()
|
||||
wkb_bytes = geom.asWkb()
|
||||
if isinstance(wkb_bytes, QByteArray):
|
||||
wkb_bytes = bytes(wkb_bytes)
|
||||
|
||||
row.append(psycopg2.Binary(wkb_bytes))
|
||||
cur.execute(insert_sql, row)
|
||||
count += 1
|
||||
|
||||
conn.commit()
|
||||
cur.close()
|
||||
conn.close()
|
||||
|
||||
print(f"[DB] Inserted features: {count}")
|
||||
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -17,7 +17,7 @@ os.environ["QT_QPA_PLATFORM"] = "offscreen"
|
|||
sys.path.append(f"{QGIS_PREFIX}/python")
|
||||
sys.path.append(f"{QGIS_PREFIX}/python/plugins")
|
||||
|
||||
from qgis.core import QgsApplication
|
||||
from qgis.core import QgsApplication, QgsProviderRegistry
|
||||
from qgis.analysis import QgsNativeAlgorithms
|
||||
|
||||
import processing
|
||||
|
|
@ -29,52 +29,7 @@ def start_qgis():
|
|||
|
||||
# === WAJIB: initialize processing ===
|
||||
Processing.initialize()
|
||||
QgsProviderRegistry.instance()
|
||||
qgs.processingRegistry().addProvider(QgsNativeAlgorithms())
|
||||
|
||||
return qgs
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# DEPLOYMENT
|
||||
# import os
|
||||
# import sys
|
||||
|
||||
# # QGIS environment
|
||||
# os.environ["QGIS_PREFIX_PATH"] = "/usr"
|
||||
# os.environ["QGIS_HOME"] = "/usr"
|
||||
|
||||
# os.environ["PROJ_LIB"] = "/usr/share/proj"
|
||||
# os.environ["GDAL_DATA"] = "/usr/share/gdal"
|
||||
# os.environ["QT_PLUGIN_PATH"] = "/usr/lib/x86_64-linux-gnu/qt5/plugins"
|
||||
|
||||
# os.environ["QT_QPA_PLATFORM"] = "offscreen"
|
||||
|
||||
# # QGIS Python plugins (THIS IS THE MISSING PART)
|
||||
# sys.path.append("/usr/share/qgis/python")
|
||||
# sys.path.append("/usr/share/qgis/python/plugins")
|
||||
|
||||
# # Python modules (from system)
|
||||
# sys.path.append("/usr/lib/python3/dist-packages")
|
||||
# sys.path.append("/usr/lib/python3/dist-packages/qgis")
|
||||
|
||||
|
||||
# from qgis.core import QgsApplication
|
||||
# from qgis.analysis import QgsNativeAlgorithms
|
||||
# import processing
|
||||
# from processing.core.Processing import Processing
|
||||
|
||||
# def start_qgis():
|
||||
# qgs = QgsApplication([], False)
|
||||
# qgs.initQgis()
|
||||
# Processing.initialize()
|
||||
# qgs.processingRegistry().addProvider(QgsNativeAlgorithms())
|
||||
# return qgs
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user