2025-11-04 06:33:17 +00:00
|
|
|
# import pandas as pd
|
|
|
|
|
# import re
|
|
|
|
|
# import csv
|
|
|
|
|
# import os
|
|
|
|
|
|
|
|
|
|
# def detect_header_line(path, max_rows=10):
|
|
|
|
|
# with open(path, 'r', encoding='utf-8', errors='ignore') as f:
|
|
|
|
|
# lines = [next(f) for _ in range(max_rows)]
|
|
|
|
|
|
|
|
|
|
# header_line_idx = 0
|
|
|
|
|
# best_score = -1
|
|
|
|
|
|
|
|
|
|
# for i, line in enumerate(lines):
|
|
|
|
|
# cells = re.split(r'[;,|\t]', line.strip())
|
|
|
|
|
# alpha_ratio = sum(bool(re.search(r'[A-Za-z]', c)) for c in cells) / max(len(cells), 1)
|
|
|
|
|
# digit_ratio = sum(bool(re.search(r'\d', c)) for c in cells) / max(len(cells), 1)
|
|
|
|
|
# score = alpha_ratio - digit_ratio
|
|
|
|
|
|
|
|
|
|
# if score > best_score:
|
|
|
|
|
# best_score = score
|
|
|
|
|
# header_line_idx = i
|
|
|
|
|
|
|
|
|
|
# return header_line_idx
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# def detect_delimiter(path, sample_size=2048):
|
|
|
|
|
# with open(path, 'r', encoding='utf-8', errors='ignore') as f:
|
|
|
|
|
# sample = f.read(sample_size)
|
|
|
|
|
# sniffer = csv.Sniffer()
|
|
|
|
|
# try:
|
|
|
|
|
# dialect = sniffer.sniff(sample)
|
|
|
|
|
# return dialect.delimiter
|
|
|
|
|
# except Exception:
|
|
|
|
|
# for delim in [',', ';', '\t', '|']:
|
|
|
|
|
# if delim in sample:
|
|
|
|
|
# return delim
|
|
|
|
|
# return ','
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# def read_csv(path: str):
|
|
|
|
|
# ext = os.path.splitext(path)[1].lower() # ambil ekstensi file
|
|
|
|
|
|
|
|
|
|
# try:
|
|
|
|
|
# if ext in ['.csv', '.txt']:
|
|
|
|
|
# # === Baca file CSV ===
|
|
|
|
|
# header_line = detect_header_line(path)
|
|
|
|
|
# delimiter = detect_delimiter(path)
|
|
|
|
|
# print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'")
|
|
|
|
|
|
|
|
|
|
# df = pd.read_csv(path, header=header_line, sep=delimiter, encoding='utf-8', low_memory=False, thousands=',')
|
|
|
|
|
|
|
|
|
|
# elif ext in ['.xlsx', '.xls']:
|
|
|
|
|
# # === Baca file Excel ===
|
|
|
|
|
# print(f"[INFO] Membaca file Excel: {os.path.basename(path)}")
|
|
|
|
|
# pre_df = pd.read_excel(path, header=0, dtype=str) # baca semua sebagai string
|
|
|
|
|
# df = pre_df.copy()
|
|
|
|
|
# for col in df.columns:
|
|
|
|
|
# if df[col].str.replace(',', '', regex=False).str.match(r'^-?\d+(\.\d+)?$').any():
|
|
|
|
|
# df[col] = df[col].str.replace(',', '', regex=False)
|
|
|
|
|
# df[col] = pd.to_numeric(df[col], errors='ignore')
|
|
|
|
|
|
|
|
|
|
# else:
|
|
|
|
|
# raise ValueError("Format file tidak dikenali (hanya .csv, .txt, .xlsx, .xls)")
|
|
|
|
|
|
|
|
|
|
# except Exception as e:
|
|
|
|
|
# print(f"[WARN] Gagal membaca file ({e}), fallback ke default")
|
|
|
|
|
# df = pd.read_csv(path, encoding='utf-8', low_memory=False, thousands=',')
|
|
|
|
|
|
|
|
|
|
# # Bersihkan kolom dan baris kosong
|
|
|
|
|
# df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed')]
|
|
|
|
|
# df.columns = [str(c).strip() for c in df.columns]
|
|
|
|
|
# df = df.dropna(how='all')
|
|
|
|
|
|
|
|
|
|
# return df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2025-10-29 10:07:48 +00:00
|
|
|
import pandas as pd
|
|
|
|
|
import re
|
2025-10-30 08:38:20 +00:00
|
|
|
import csv
|
|
|
|
|
import os
|
2025-10-29 10:07:48 +00:00
|
|
|
|
|
|
|
|
def detect_header_line(path, max_rows=10):
    """Guess the zero-based index of the header row of a delimited text file.

    The first ``max_rows`` lines are split on ``;``, ``,``, ``|`` and tab.
    Each line is scored as (share of cells containing an ASCII letter) minus
    (share of cells containing a digit); the first line with the highest
    score wins.

    Args:
        path: Path of the text file to inspect. It is decoded as UTF-8 and
            undecodable bytes are ignored.
        max_rows: Maximum number of leading lines to examine.

    Returns:
        Index (0-based, counted in raw file lines) of the best-scoring line,
        or 0 when the file is empty or no line scores above -1.
    """
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        # zip() stops at whichever iterable is exhausted first, so files
        # shorter than max_rows no longer raise StopIteration. range() comes
        # first so no extra line is consumed once the limit is reached.
        lines = [line for _, line in zip(range(max_rows), f)]

    header_line_idx = 0
    best_score = -1
    for i, line in enumerate(lines):
        cells = re.split(r'[;,|\t]', line.strip())
        cell_count = max(len(cells), 1)
        alpha_ratio = sum(bool(re.search(r'[A-Za-z]', c)) for c in cells) / cell_count
        digit_ratio = sum(bool(re.search(r'\d', c)) for c in cells) / cell_count
        score = alpha_ratio - digit_ratio

        # Strict comparison keeps the earliest line on ties.
        if score > best_score:
            best_score = score
            header_line_idx = i

    return header_line_idx
|
|
|
|
|
|
2025-10-30 08:38:20 +00:00
|
|
|
def detect_delimiter(path, sample_size=2048):
    """Guess the column delimiter of a delimited text file.

    Only ``,``, ``;``, tab and ``|`` are considered. ``csv.Sniffer`` is tried
    first; if it cannot decide, the candidate that occurs most often in the
    sample is used.

    Args:
        path: Path of the text file to inspect. It is decoded as UTF-8 and
            undecodable bytes are ignored.
        sample_size: Number of characters read from the start of the file.

    Returns:
        One of ``','``, ``';'``, ``'\\t'`` or ``'|'``. ``','`` is returned
        when none of the candidates appears in the sample.
    """
    # Listed in priority order: max() below keeps the first one on ties.
    candidates = [',', ';', '\t', '|']

    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        sample = f.read(sample_size)

    try:
        # Restrict the sniffer to the supported delimiters; unrestricted it
        # can return any character that repeats consistently (e.g. a space).
        dialect = csv.Sniffer().sniff(sample, delimiters=''.join(candidates))
        return dialect.delimiter
    except Exception:
        # Best effort: any sniffing failure falls through to the heuristic.
        pass

    # Pick the dominant candidate rather than the first one that merely
    # occurs. When no candidate is present every count is 0 and max()
    # returns the first entry, i.e. ','.
    return max(candidates, key=sample.count)
|
|
|
|
|
|
2025-10-29 10:07:48 +00:00
|
|
|
def _select_best_sheet(path):
    """Return ``(sheet_name, DataFrame)`` for the most relevant Excel sheet.

    Every sheet is read as strings, stripped of fully empty rows and columns,
    and scored as ``rows * 0.7 + text_ratio * 100`` where ``text_ratio`` is
    the share of cells holding a string. Sheets that are empty, have fewer
    than two columns, or fail to parse are skipped.

    Raises:
        ValueError: If no sheet qualifies.
    """
    best_sheet = None
    best_score = -1
    best_df = None

    # The context manager closes the workbook handle even when a sheet fails.
    with pd.ExcelFile(path) as xls:
        print(f"[INFO] Ditemukan {len(xls.sheet_names)} sheet: {xls.sheet_names}")

        for sheet_name in xls.sheet_names:
            try:
                sheet_df = pd.read_excel(xls, sheet_name=sheet_name, header=0, dtype=str)
                sheet_df = sheet_df.dropna(how='all').dropna(axis=1, how='all')

                if len(sheet_df) == 0 or len(sheet_df.columns) < 2:
                    continue

                # Relevance score. Series.map is used per column instead of
                # the deprecated DataFrame.applymap.
                is_text = sheet_df.apply(lambda col: col.map(lambda x: isinstance(x, str)))
                text_ratio = is_text.sum().sum() / (sheet_df.size or 1)
                score = (len(sheet_df) * 0.7) + (text_ratio * 100)

                if score > best_score:
                    best_score = score
                    best_sheet = sheet_name
                    best_df = sheet_df

            except Exception as e:
                # Best effort: one unreadable sheet must not abort the rest.
                print(f"[WARN] Gagal membaca sheet {sheet_name}: {e}")
                continue

    if best_df is None:
        raise ValueError("Tidak ada sheet valid yang dapat dibaca.")

    print(f"[INFO] Sheet terpilih: '{best_sheet}' dengan skor {best_score:.2f}")
    return best_sheet, best_df


def _coerce_numeric_columns(df):
    """Return a copy of ``df`` with purely numeric string columns converted.

    A column is converted only when every non-null cell, after removing
    thousands commas and surrounding whitespace, looks like an integer or
    decimal number. Mixed columns are left untouched, so text keeps its
    commas and missing values stay missing instead of becoming ``'nan'``.
    """
    df = df.copy()
    for col in df.columns:
        present = df[col].dropna()
        if present.empty:
            continue
        cleaned = present.astype(str).str.replace(',', '', regex=False).str.strip()
        if cleaned.str.match(r'^-?\d+(\.\d+)?$').all():
            # Assignment aligns on the index, so rows that were null in the
            # original column come back as NaN.
            df[col] = pd.to_numeric(cleaned)
    return df


def read_csv(path: str) -> pd.DataFrame:
    """Load a CSV/TXT or Excel file into a cleaned DataFrame.

    For ``.csv``/``.txt`` the header row and delimiter are auto-detected with
    ``detect_header_line`` and ``detect_delimiter``. For ``.xlsx``/``.xls``
    the best-scoring sheet is selected and its purely numeric columns are
    converted to numbers. If anything in that step fails, including an
    unrecognised extension, a warning is printed and the file is re-read with
    a plain ``pd.read_csv``.

    Afterwards columns whose name starts with ``Unnamed`` are dropped, column
    names are stripped of surrounding whitespace, and fully empty rows are
    removed.

    Args:
        path: Path of the file to read.

    Returns:
        The cleaned DataFrame.

    Raises:
        Exception: Whatever the fallback ``pd.read_csv`` raises when it also
            cannot read the file.
    """
    ext = os.path.splitext(path)[1].lower()

    try:
        if ext in ('.csv', '.txt'):
            # === Read delimited text file ===
            # NOTE(review): header_line is counted in raw file lines, while
            # pandas ignores blank lines when interpreting `header`; files
            # with blank lines before the header may be offset -- confirm.
            header_line = detect_header_line(path)
            delimiter = detect_delimiter(path)
            print(f"[INFO] Detected header line: {header_line + 1}, delimiter: '{delimiter}'")

            df = pd.read_csv(path, header=header_line, sep=delimiter, encoding='utf-8', low_memory=False, thousands=',')

        elif ext in ('.xlsx', '.xls'):
            # === Read Excel file ===
            print(f"[INFO] Membaca file Excel: {os.path.basename(path)}")
            _, df = _select_best_sheet(path)
            # Convert numeric-looking columns (cells were read as strings).
            df = _coerce_numeric_columns(df)

        else:
            raise ValueError("Format file tidak dikenali (hanya .csv, .txt, .xlsx, .xls)")

    except Exception as e:
        # Deliberate best effort: every failure above, including the
        # ValueError for an unknown extension, falls back to a plain CSV read.
        print(f"[WARN] Gagal membaca file ({e}), fallback ke default reader.")
        df = pd.read_csv(path, encoding='utf-8', low_memory=False, thousands=',')

    # Drop auto-generated columns and empty rows, and tidy the column names.
    df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed')]
    df.columns = [str(c).strip() for c in df.columns]
    df = df.dropna(how='all')

    return df
|