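"""AI Resume Analyzer.

A Gradio app that scores uploaded resumes (PDF or DOCX) against a job
description using fuzzy text matching, spaCy named-entity overlap, and an
AI summary generated with Google's Gemini API.
"""
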
import logging
import os
import re
import subprocess
import sys
from pathlib import Path
from typing import List, Set

import docx
import fitz  # PyMuPDF
import google.generativeai as genai
import gradio as gr
import nltk
import pandas as pd
import spacy
from fuzzywuzzy import fuzz
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
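
# Third-party dependencies: google-generativeai, spacy (+ en_core_web_sm),
# nltk, gradio, pandas, fuzzywuzzy, PyMuPDF (imported as fitz), python-docx.
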
class ResumeAnalyzer:
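    """Extracts text from PDF/DOCX resumes and scores it against a job
    description via fuzzy matching, entity overlap, and a Gemini summary."""
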
    def __init__(self):
        self._initialize_logging()
        self._initialize_nltk()
        self._initialize_spacy()
        self._setup_api_key()

    def _initialize_logging(self):
        self.logger = logging.getLogger(__name__)
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )

    def _initialize_nltk(self) -> None:
        try:
            nltk.data.path.append(os.getcwd())
            # NLTK keeps 'punkt' under tokenizers/ but 'stopwords' and
            # 'wordnet' under corpora/, so each resource is looked up in its
            # own category ('punkt_tab' is additionally needed by newer NLTK).
            for resource, path in [('punkt', 'tokenizers/punkt'),
                                   ('punkt_tab', 'tokenizers/punkt_tab'),
                                   ('stopwords', 'corpora/stopwords'),
                                   ('wordnet', 'corpora/wordnet')]:
                try:
                    nltk.data.find(path)
                except LookupError:
                    nltk.download(resource, quiet=True)
            self.stop_words = set(stopwords.words('english'))
            self.lemmatizer = WordNetLemmatizer()
        except Exception as e:
            self.logger.error(f"Failed to initialize NLTK: {str(e)}")
            raise

    def _initialize_spacy(self) -> None:
        try:
            self.nlp = spacy.load("en_core_web_sm")
        except OSError:
            self.logger.info("Downloading spaCy model...")
            # Use the running interpreter so the model installs into the
            # same environment the app is executing in.
            subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"], check=True)
            self.nlp = spacy.load("en_core_web_sm")

    def _setup_api_key(self) -> None:
        try:
            self.google_api_key = os.environ.get("GOOGLE_API_KEY")
            if not self.google_api_key:
                raise ValueError("GOOGLE_API_KEY not found in environment variables")
            genai.configure(api_key=self.google_api_key)
        except Exception as e:
            self.logger.error(f"Failed to set up API key: {str(e)}")
            raise

    def extract_text_from_pdf(self, file_path: str) -> str:
        try:
            with fitz.open(file_path) as doc:
                return " ".join(page.get_text() for page in doc)
        except Exception as e:
            self.logger.error(f"Error extracting text from PDF: {str(e)}")
            return ""

    def extract_text_from_docx(self, file_path: str) -> str:
        try:
            doc = docx.Document(file_path)
            return "\n".join(para.text for para in doc.paragraphs)
        except Exception as e:
            self.logger.error(f"Error extracting text from DOCX: {str(e)}")
            return ""

    def preprocess_text(self, text: str) -> str:
        try:
            text = text.lower()
            text = re.sub(r'\s+', ' ', text)
            text = re.sub(r'[^a-z0-9\s]', '', text)
            tokens = word_tokenize(text)
            tokens = [self.lemmatizer.lemmatize(word)
                      for word in tokens
                      if word not in self.stop_words]
            return " ".join(tokens)
        except Exception as e:
            self.logger.error(f"Error in text preprocessing: {str(e)}")
            return text

    def extract_named_entities(self, text: str) -> Set[str]:
        try:
            # Cap input length to keep spaCy's memory use bounded.
            doc = self.nlp(text[:100000])
            return {ent.text.lower() for ent in doc.ents}
        except Exception as e:
            self.logger.error(f"Error in named entity extraction: {str(e)}")
            return set()

    def calculate_match_percentage(self, resume_text: str, job_desc_text: str) -> float:
        try:
            resume_text = self.preprocess_text(resume_text)
            job_desc_text = self.preprocess_text(job_desc_text)
            # token_set_ratio returns an int in [0, 100]; cast to match the
            # declared return type.
            return float(fuzz.token_set_ratio(resume_text, job_desc_text))
        except Exception as e:
            self.logger.error(f"Error calculating match percentage: {str(e)}")
            return 0.0

    def gemini_analysis(self, text: str) -> str:
        try:
            model = genai.GenerativeModel('gemini-pro')
            prompt = f"""Analyze this resume text and provide a brief summary of key skills and experience:
{text[:1000]}..."""
            response = model.generate_content(prompt)
            return response.text
        except Exception as e:
            self.logger.error(f"Error in Gemini analysis: {str(e)}")
            return "AI analysis failed"

    def process_file(self, file_path: str, job_desc: str) -> dict:
        try:
            # Extract text based on file type.
            if file_path.lower().endswith('.pdf'):
                text = self.extract_text_from_pdf(file_path)
            elif file_path.lower().endswith('.docx'):
                text = self.extract_text_from_docx(file_path)
            else:
                return {"Resume": Path(file_path).name, "Match Percentage": "Invalid File Type"}
            if not text.strip():
                return {"Resume": Path(file_path).name, "Match Percentage": "No text extracted"}
            # Entity overlap: the fraction of job-description entities that
            # also appear in the resume.
            entities = self.extract_named_entities(text)
            job_entities = self.extract_named_entities(job_desc)
            entity_match = (
                len(entities.intersection(job_entities)) / len(job_entities) * 100
                if job_entities else 0
            )
            match_percentage = self.calculate_match_percentage(text, job_desc)
            analysis = self.gemini_analysis(text)
            return {
                "Resume": Path(file_path).name,
                "Match Percentage": round(match_percentage, 2),
                "Entity Match (%)": round(entity_match, 2),
                "AI Analysis": analysis
            }
        except Exception as e:
            self.logger.error(f"Error processing file {file_path}: {str(e)}")
            return {"Resume": Path(file_path).name, "Error": str(e)}

    def process_uploaded_resumes(self, resume_files: List[str], job_desc: str) -> pd.DataFrame:
        if not resume_files:
            return pd.DataFrame({"Message": ["Please upload at least one resume."]})
        if not job_desc.strip():
            return pd.DataFrame({"Message": ["Please provide a job description."]})
        results = [self.process_file(file_path, job_desc) for file_path in resume_files]
        return pd.DataFrame(results)
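
# The analyzer can also be used programmatically (no UI): process_uploaded_resumes
# returns a DataFrame with "Resume", "Match Percentage", "Entity Match (%)", and
# "AI Analysis" columns (or a "Message"/"Error" column when something goes wrong).
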
# Create the Gradio interface
analyzer = ResumeAnalyzer()
interface = gr.Interface(
    fn=analyzer.process_uploaded_resumes,
    inputs=[
        gr.File(
            label="Upload Resumes (PDF or DOCX)",
            file_types=[".pdf", ".docx"],
            # gr.File takes file_count="multiple" (not multiple=True) to
            # accept several uploads; type="filepath" hands the handler
            # plain path strings.
            file_count="multiple",
            type="filepath"
        ),
        gr.Textbox(
            label="Job Description",
            placeholder="Paste the job description here...",
            lines=6
        )
    ],
    outputs=gr.DataFrame(label="Analysis Results"),
    title="AI Resume Analyzer",
    description="""
    Upload resumes (PDF or DOCX) and provide a job description to see how well they match.
    The analysis includes:
    - Overall match percentage
    - Key skills and experience matching
    - AI-powered resume analysis
    """,
    allow_flagging="never",
    theme=gr.themes.Soft()
)
if __name__ == "__main__":
    interface.launch()