Abiyasa_Putra_Prasetya/Model LLM/fastapi-llama/utils_file.py

61 lines
1.9 KiB
Python

import os
from docx import Document
import fitz
# .docx and .txt files have no real page structure, so a "page" is
# approximated by a fixed number of paragraphs / lines.
_DOCX_PARAGRAPHS_PER_PAGE = 20
_TXT_LINES_PER_PAGE = 40


def _read_pdf(filepath: str, start_page: int, max_chars: int) -> str:
    """Return up to ``max_chars`` of text from a PDF, starting at page index ``start_page``."""
    doc = fitz.open(filepath)
    try:
        chunks = []
        collected = 0
        # An out-of-range start_page yields an empty range and therefore "".
        for page_num in range(start_page, len(doc)):
            page_text = doc[page_num].get_text()
            chunks.append(page_text)
            collected += len(page_text)
            if collected >= max_chars:
                # Enough text gathered; skip extracting the remaining pages.
                break
        return "".join(chunks)[:max_chars]
    finally:
        # Always release the document handle, even if extraction fails.
        doc.close()


def _read_docx(filepath: str, start_page: int, max_chars: int) -> str:
    """Return up to ``max_chars`` of text from the non-empty paragraphs of a .docx file."""
    stripped = (para.text.strip() for para in Document(filepath).paragraphs)
    paragraphs = [text for text in stripped if text]
    start_index = start_page * _DOCX_PARAGRAPHS_PER_PAGE
    # Slicing past the end gives an empty list, which joins to "".
    return "\n".join(paragraphs[start_index:])[:max_chars]


def _read_txt(filepath: str, start_page: int, max_chars: int) -> str:
    """Return up to ``max_chars`` of text from a UTF-8 text file."""
    with open(filepath, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    start_index = start_page * _TXT_LINES_PER_PAGE
    # Lines keep their own newline characters, so join without a separator.
    return "".join(lines[start_index:])[:max_chars]


def read_text_from_file(filepath: str, start_page: int = 0, max_chars: int = 3000) -> str:
    """Read a bounded excerpt of text from a .pdf, .docx or .txt file.

    The file type is chosen from the (case-insensitive) file extension.

    Args:
        filepath: Path of the file to read.
        start_page: Zero-based page to start from. For PDFs this is a real
            page index; for .docx it is a block of 20 non-empty paragraphs
            and for .txt a block of 40 lines. Negative values are treated
            as 0.
        max_chars: Maximum number of characters to return.

    Returns:
        The extracted text truncated to ``max_chars`` characters, or an
        empty string when the file has no text or ``start_page`` lies
        beyond the end of the content.

    Raises:
        Exception: If the extension is not supported, or if reading the
            file fails (the underlying error is chained as ``__cause__``).
    """
    ext = os.path.splitext(filepath)[1].lower()
    readers = {
        '.pdf': _read_pdf,
        '.docx': _read_docx,
        '.txt': _read_txt,
    }
    reader = readers.get(ext)
    if reader is None:
        # Raised outside the try block so it is not re-wrapped as a
        # misleading "failed to read" error for a file never opened.
        raise Exception(f"Format file tidak didukung: {ext}")

    # A negative page would index from the end of a PDF / slice the tail of
    # the line list; clamp so all formats start from the beginning instead.
    start_page = max(start_page, 0)

    try:
        return reader(filepath, start_page, max_chars)
    except Exception as e:
        raise Exception(f"Gagal membaca file {ext.upper()}: {e}") from e