"""Select tables whose column names mention geographic or administrative terms."""
import functools
import itertools
import re

# Keywords (English and Indonesian) that mark a column as geographic or
# administrative. Matching is done on whole tokens of the normalized name.
geo_admin_keywords = [
    'lat', 'lon', 'long', 'latitude', 'longitude', 'koordinat', 'geometry', 'geometri',
    'desa', 'kelurahan', 'kel', 'kecamatan', 'kabupaten', 'kab', 'kota', 'provinsi',
    'lokasi', 'region', 'area', 'zone', 'boundary', 'batas'
]

# Everything except lowercase letters, digits, "/" and space is a separator.
_NON_TOKEN_CHARS = re.compile(r'[^a-z0-9/ ]+')
_WHITESPACE_RUN = re.compile(r'\s+')

# Normalized names shorter than this are never considered a match.
_MIN_TEXT_LENGTH = 3


def normalize_text(text):
    """Return *text* lowercased with separators collapsed to single spaces.

    Every run of characters other than ``a-z``, ``0-9``, ``/`` and space is
    replaced by one space, whitespace runs are collapsed, and the result is
    stripped.

    Args:
        text: The value to normalize. ``None`` yields ``''``; other
            non-string values (e.g. integer column labels) are converted
            with ``str()`` instead of raising ``AttributeError``.

    Returns:
        The normalized string.
    """
    if text is None:
        return ''
    lowered = str(text).lower()
    spaced = _NON_TOKEN_CHARS.sub(' ', lowered)
    return _WHITESPACE_RUN.sub(' ', spaced).strip()


def generate_combined_patterns(keywords):
    """Return regex sources matching every ordered ``a / b`` keyword pair.

    For each unordered pair of *keywords* two patterns are produced, one per
    ordering, each allowing optional whitespace around the slash. Keywords
    are escaped so that regex metacharacters in them are matched literally.

    Args:
        keywords: Iterable of keyword strings.

    Returns:
        A list of regex pattern strings (not compiled).
    """
    patterns = []
    for first, second in itertools.combinations(keywords, 2):
        a, b = re.escape(first), re.escape(second)
        patterns.append(rf'{a}\s*/\s*{b}')
        patterns.append(rf'{b}\s*/\s*{a}')
    return patterns


# Retained for backward compatibility with code importing this name.
# contains_geo_admin_keywords() no longer scans these: searched without word
# boundaries they matched inside unrelated words (e.g. "plat/kabel"), and
# every genuine "kw/kw" name is already caught by the whole-token check.
combined_patterns = generate_combined_patterns(geo_admin_keywords)


@functools.lru_cache(maxsize=8)
def _keyword_regex(keywords):
    """Compile one regex matching any of *keywords* as a whole token.

    *keywords* must be a non-empty tuple (hashable for the cache). A token
    boundary is any position not adjacent to ``a-z``/``0-9``, which in
    normalized text means start/end of string, a space, or ``/``.
    """
    alternation = '|'.join(re.escape(kw) for kw in keywords)
    return re.compile(rf'(?<![a-z0-9])(?:{alternation})(?![a-z0-9])')


def contains_geo_admin_keywords(text):
    """Return True if *text* contains a geo/admin keyword as a whole token.

    The text is normalized with :func:`normalize_text` first, so keywords
    separated by punctuation, underscores, hyphens, spaces or ``/`` are all
    recognized (``"Kab/Kota"``, ``"nama_desa"``), while keywords embedded in
    longer words (``"platform"``) are not.

    Args:
        text: A column name; ``None`` and non-string values are tolerated.

    Returns:
        ``True`` when a keyword is found, ``False`` otherwise, including
        when the normalized text is shorter than three characters.
    """
    text_clean = normalize_text(text)
    if len(text_clean) < _MIN_TEXT_LENGTH or not geo_admin_keywords:
        return False
    # The tuple is rebuilt per call so runtime edits to geo_admin_keywords
    # are honored; the compiled regex itself is cached.
    matcher = _keyword_regex(tuple(geo_admin_keywords))
    return matcher.search(text_clean) is not None


def filter_geo_admin_column(tables):
    """Return the tables that have at least one geo/admin column name.

    Args:
        tables: Iterable of mappings, each with a ``'columns'`` entry that
            is an iterable of column names.

    Returns:
        A new list with the matching tables, in their original order.

    Raises:
        KeyError: If a table has no ``'columns'`` entry.
    """
    return [
        table for table in tables
        if any(contains_geo_admin_keywords(col) for col in table['columns'])
    ]