import re

import pandas as pd

from config import PROCESSED_DIR, RAW_DIR, MIN_YEAR, MAX_TEXTS
from utils.console_manager import console_manager

# Input/output paths for the arXiv astro-ph dataset.
RAW_CSV = RAW_DIR / "arxiv_astro_ph.csv"
PROCESSED_CSV = PROCESSED_DIR / "arxiv_astro_ph_2020plus.csv"
PROCESSED_PARQUET = PROCESSED_DIR / "arxiv_astro_ph_2020plus.parquet"


def load_filtered():
    """Load the raw arXiv CSV and keep only rows updated in MIN_YEAR or later.

    Returns:
        pd.DataFrame with columns ["id", "title", "abstract", "categories", "year"],
        index reset. Rows whose update_date fails to parse become NaT and are
        dropped by the year comparison.
    """
    usecols = ["id", "title", "abstract", "categories", "update_date"]
    df = pd.read_csv(RAW_CSV, usecols=usecols, dtype={"id": str}, low_memory=False)
    # errors="coerce" turns unparseable dates into NaT rather than raising.
    df["update_date"] = pd.to_datetime(df["update_date"], errors="coerce")
    df = df[df["update_date"].dt.year >= MIN_YEAR].copy()
    df["year"] = df["update_date"].dt.year
    df = df[["id", "title", "abstract", "categories", "year"]].reset_index(drop=True)
    return df


def clean_text(text: str) -> str:
    """Collapse all whitespace (including newlines) in *text* to single spaces.

    Non-string input (e.g. NaN from pandas) yields the empty string.
    """
    if not isinstance(text, str):
        return ""
    text = text.replace("\n", " ").replace("\r", " ")
    text = re.sub(r"\s+", " ", text)
    return text.strip()


def clean_dataframe(df: pd.DataFrame, text_columns=None) -> pd.DataFrame:
    """Apply clean_text to the given text columns of *df*, in place.

    Args:
        df: frame to clean; its columns are mutated directly.
        text_columns: columns to clean; defaults to ["title", "abstract"].

    Returns:
        The same (mutated) DataFrame, for chaining.
    """
    if text_columns is None:
        text_columns = ["title", "abstract"]
    for col in text_columns:
        df[col] = df[col].apply(clean_text)
    return df


def preprocess_and_save():
    """Run the full preprocessing pipeline and persist CSV + Parquet outputs.

    If both processed files already exist, the cached Parquet is loaded and
    returned instead of re-processing. On any failure the error is reported
    via the console manager and None is returned.

    Returns:
        pd.DataFrame of the processed data, or None on failure.
    """
    try:
        # Fast path: reuse previously processed output if both artifacts exist.
        if PROCESSED_PARQUET.exists() and PROCESSED_CSV.exists():
            console_manager.print_info(
                f"Processed files already exist at {PROCESSED_DIR}. Skipping preprocessing."
            )
            df = pd.read_parquet(PROCESSED_PARQUET)
            console_manager.print_info(
                f"Loaded existing processed data ({df.shape[0]} rows)."
            )
            return df

        with console_manager.status("Processing file..."):
            df = load_filtered()
            df = clean_dataframe(df)

            # MAX_TEXTS caps the dataset size, used for quick test runs.
            if MAX_TEXTS is not None:
                df = df.head(MAX_TEXTS)
                console_manager.print_info(
                    f"Limiting dataset to {len(df)} rows for testing."
                )

            df.to_csv(PROCESSED_CSV, index=False)
            df.to_parquet(PROCESSED_PARQUET, index=False)

        console_manager.print_success(
            f"Pre-processing complete. Files saved in: {PROCESSED_DIR}"
        )
        return df
    except Exception as e:
        # Top-level boundary: report and signal failure to the caller.
        console_manager.print_error(f"Pre-processing failed: {e}")
        return None