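"""AI Resume Analyzer.

Scores uploaded PDF/DOCX resumes against a job description by combining fuzzy
text matching (fuzzywuzzy), named-entity overlap (spaCy), and a Gemini-generated
summary, served through a Gradio interface.
"""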
import os
import logging
import google.generativeai as genai
import spacy
import nltk
import gradio as gr
import pandas as pd
import re
from fuzzywuzzy import fuzz
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import fitz
from typing import List, Set
import docx
from pathlib import Path

class ResumeAnalyzer:
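    """Extracts text from resume files and scores them against a job description."""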
    def __init__(self):
        self._initialize_logging()
        self._initialize_nltk()
        self._initialize_spacy()
        self._setup_api_key()
        
    def _initialize_logging(self):
        self.logger = logging.getLogger(__name__)
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )

    def _initialize_nltk(self) -> None:
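        """Ensure the required NLTK data is available and set up stopwords and the lemmatizer."""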
        try:
            nltk.data.path.append(os.getcwd())
            # punkt lives under tokenizers/, stopwords and wordnet under corpora/.
            for resource, path in [('punkt', 'tokenizers/punkt'),
                                   ('stopwords', 'corpora/stopwords'),
                                   ('wordnet', 'corpora/wordnet')]:
                try:
                    nltk.data.find(path)
                except LookupError:
                    nltk.download(resource, quiet=True)
            self.stop_words = set(stopwords.words('english'))
            self.lemmatizer = WordNetLemmatizer()
        except Exception as e:
            self.logger.error(f"Failed to initialize NLTK: {str(e)}")
            raise

    def _initialize_spacy(self) -> None:
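        """Load the small English spaCy model, downloading it on first run."""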
        try:
            self.nlp = spacy.load("en_core_web_sm")
        except OSError:
            self.logger.info("Downloading spaCy model...")
            import subprocess
            import sys
            # Use the current interpreter so the model installs into the active environment.
            subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"], check=True)
            self.nlp = spacy.load("en_core_web_sm")

    def _setup_api_key(self) -> None:
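        """Read GOOGLE_API_KEY from the environment and configure the Gemini client."""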
        try:
            self.google_api_key = os.environ.get("GOOGLE_API_KEY")
            if not self.google_api_key:
                raise ValueError("GOOGLE_API_KEY not found in environment variables")
            genai.configure(api_key=self.google_api_key)
        except Exception as e:
            self.logger.error(f"Failed to setup API key: {str(e)}")
            raise

    def extract_text_from_pdf(self, file_path: str) -> str:
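        """Concatenate text from all pages of the PDF via PyMuPDF (fitz)."""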
        try:
            with fitz.open(file_path) as doc:
                text = " ".join(page.get_text() for page in doc)
            return text
        except Exception as e:
            self.logger.error(f"Error extracting text from PDF: {str(e)}")
            return ""

    def extract_text_from_docx(self, file_path: str) -> str:
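        """Join the text of all paragraphs in the DOCX document."""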
        try:
            doc = docx.Document(file_path)
            return "\n".join(para.text for para in doc.paragraphs)
        except Exception as e:
            self.logger.error(f"Error extracting text from DOCX: {str(e)}")
            return ""

    def preprocess_text(self, text: str) -> str:
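        """Lowercase, strip punctuation, tokenize, drop stopwords, and lemmatize the text."""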
        try:
            text = text.lower()
            text = re.sub(r'\s+', ' ', text)
            text = re.sub(r'[^a-z0-9\s]', '', text)
            tokens = word_tokenize(text)
            tokens = [self.lemmatizer.lemmatize(word) 
                     for word in tokens 
                     if word not in self.stop_words]
            return " ".join(tokens)
        except Exception as e:
            self.logger.error(f"Error in text preprocessing: {str(e)}")
            return text

    def extract_named_entities(self, text: str) -> Set[str]:
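        """Return the lowercased named entities spaCy finds in the text (capped at 100k characters)."""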
        try:
            doc = self.nlp(text[:100000])
            return {ent.text.lower() for ent in doc.ents}
        except Exception as e:
            self.logger.error(f"Error in named entity extraction: {str(e)}")
            return set()

    def calculate_match_percentage(self, resume_text: str, job_desc_text: str) -> float:
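        """Return a 0-100 fuzzy similarity score between the preprocessed resume and job description."""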
        try:
            resume_text = self.preprocess_text(resume_text)
            job_desc_text = self.preprocess_text(job_desc_text)
            return float(fuzz.token_set_ratio(resume_text, job_desc_text))
        except Exception as e:
            self.logger.error(f"Error calculating match percentage: {str(e)}")
            return 0.0

    def gemini_analysis(self, text: str) -> str:
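        """Ask Gemini for a brief summary of the resume's key skills and experience (first 1,000 characters only)."""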
        try:
            model = genai.GenerativeModel('gemini-pro')
            prompt = f"""Analyze this resume text and provide a brief summary of key skills and experience:
                        {text[:1000]}..."""
            response = model.generate_content(prompt)
            return response.text
        except Exception as e:
            self.logger.error(f"Error in Gemini analysis: {str(e)}")
            return "AI analysis failed"

    def process_file(self, file_path: str, job_desc: str) -> dict:
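        """Extract text from a single resume file and score it against the job description."""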
        try:
            # Extract text based on file type
            if file_path.lower().endswith('.pdf'):
                text = self.extract_text_from_pdf(file_path)
            elif file_path.lower().endswith('.docx'):
                text = self.extract_text_from_docx(file_path)
            else:
                return {"Resume": Path(file_path).name, "Match Percentage": "Invalid File Type"}

            if not text.strip():
                return {"Resume": Path(file_path).name, "Match Percentage": "No text extracted"}

            entities = self.extract_named_entities(text)
            job_entities = self.extract_named_entities(job_desc)
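            # Entity match: fraction of job-description entities also found in the resume, as a percentage.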
            
            entity_match = (
                len(entities.intersection(job_entities)) / len(job_entities) * 100
                if job_entities else 0
            )
            
            match_percentage = self.calculate_match_percentage(text, job_desc)
            gemini_analysis = self.gemini_analysis(text)

            return {
                "Resume": Path(file_path).name,
                "Match Percentage": round(match_percentage, 2),
                "Entity Match (%)": round(entity_match, 2),
                "AI Analysis": gemini_analysis
            }

        except Exception as e:
            self.logger.error(f"Error processing file {file_path}: {str(e)}")
            return {"Resume": Path(file_path).name, "Error": str(e)}

    def process_uploaded_resumes(self, resume_files: List[str], job_desc: str) -> pd.DataFrame:
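        """Score each uploaded resume against the job description and return the results as a DataFrame."""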
        if not resume_files:
            return pd.DataFrame({"Message": ["Please upload at least one resume."]})
        
        if not job_desc or not job_desc.strip():
            return pd.DataFrame({"Message": ["Please provide a job description."]})

        results = []
        for file_path in resume_files:
            result = self.process_file(file_path, job_desc)
            results.append(result)

        return pd.DataFrame(results)

# Create the Gradio interface
analyzer = ResumeAnalyzer()

interface = gr.Interface(
    fn=analyzer.process_uploaded_resumes,
    inputs=[
        gr.File(
            label="Upload Resumes (PDF or DOCX)",
            file_types=[".pdf", ".docx"],
            file_count="multiple"
        ),
        gr.Textbox(
            label="Job Description",
            placeholder="Paste the job description here...",
            lines=6
        )
    ],
    outputs=gr.DataFrame(label="Analysis Results"),
    title="AI Resume Analyzer",
    description="""
    Upload resumes (PDF or DOCX) and provide a job description to see how well they match.
    The analysis includes:
    - Overall match percentage
    - Named-entity overlap between resume and job description
    - AI-powered resume analysis
    """,
    allow_flagging="never",
    theme=gr.themes.Soft()
)

if __name__ == "__main__":
    interface.launch()