bangaboy committed on
Commit
6e31617
·
verified ·
1 Parent(s): cd743ff

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +123 -0
app.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from pyngrok import ngrok
3
+ import google.generativeai as genai
4
+ import fitz # PyMuPDF for PDF text extraction
5
+ import spacy
6
+ from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
7
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
8
+ from docx import Document
9
+ import re
10
+ from nltk.corpus import words
11
+ import dateparser
12
+ from datetime import datetime
13
+ import os
14
+
15
# --- ngrok tunnel (development convenience) ---
# SECURITY: the auth token was previously hard-coded here. A secret
# committed to source control must be treated as leaked and rotated.
# Read it from the environment instead, and skip tunnelling when absent.
# NOTE(review): opening the tunnel at module level means it runs on every
# Streamlit script rerun — confirm this is intended.
ngrok_token = os.environ.get("NGROK_AUTH_TOKEN")
if ngrok_token:
    ngrok.set_auth_token(ngrok_token)
    url = ngrok.connect(8501)
    print(f"Public URL: {url}")

# Load SpaCy model (base tokenization + statistical NER).
nlp_spacy = spacy.load('en_core_web_sm')

# Load Babelscape multilingual NER model (token classification pipeline,
# aggregated into whole-entity spans).
tokenizer_ner = AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner")
model_ner = AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner")
nlp_ner = pipeline('ner', model=model_ner, tokenizer=tokenizer_ner, aggregation_strategy="simple")

# Load the GLinER-style seq2seq model used for prompt-based extraction.
gliner_tokenizer = AutoTokenizer.from_pretrained("DAMO-NLP-SG/gliner-large")
gliner_model = AutoModelForSeq2SeqLM.from_pretrained("DAMO-NLP-SG/gliner-large")
31
+
32
class EnhancedNERPipeline:
    """Combine spaCy, Babelscape NER, and GLinER prompt extraction.

    Calling an instance with raw text returns the spaCy ``Doc`` enriched
    with three custom extension attributes:

    * ``doc._.gliner_companies``  - list[str] of company names
    * ``doc._.gliner_experience`` - raw GLinER answer for years of experience
    * ``doc._.gliner_education``  - list[str] of educational institutions
    """

    def __init__(self, nlp_spacy, nlp_ner, gliner_model, gliner_tokenizer):
        self.nlp_spacy = nlp_spacy
        self.nlp_ner = nlp_ner
        self.gliner_model = gliner_model
        self.gliner_tokenizer = gliner_tokenizer
        # Register the custom Doc extensions used in __call__. The original
        # code assigned doc._.gliner_* without registering them, which
        # raises AttributeError in spaCy.
        for ext in ("gliner_companies", "gliner_experience", "gliner_education"):
            spacy.tokens.Doc.set_extension(ext, default=None, force=True)

    def __call__(self, text):
        doc = self.nlp_spacy(text)

        # The transformers NER pipeline returns plain dicts, not spaCy
        # Spans, so they cannot be merged into doc.ents (the original code
        # did `doc.ents + tuple(ner_results)` and then read `ent.label_`,
        # which crashes on a dict). Fold its ORG hits into the company
        # list instead; extract_companies() unions company sources anyway.
        ner_results = self.nlp_ner(text)
        babelscape_orgs = [r["word"] for r in ner_results if r.get("entity_group") == "ORG"]

        gliner_companies = extract_info_with_gliner(text, "company names")
        gliner_experience = extract_info_with_gliner(text, "years of experience")
        gliner_education = extract_info_with_gliner(text, "educational institutions")

        doc._.gliner_companies = gliner_companies.split(', ') + babelscape_orgs
        doc._.gliner_experience = gliner_experience
        doc._.gliner_education = gliner_education.split(', ')
        # Leave doc.ents untouched (including ORG spans): downstream
        # extract_companies()/extract_education() filter on label_ == "ORG",
        # which the original code stripped out, making those lookups
        # always come back empty.
        return doc
52
+
53
def extract_info_with_gliner(text, info_type):
    """Prompt the GLinER seq2seq model to pull *info_type* out of *text*.

    Builds an instruction-style prompt, generates up to 100 tokens, and
    returns the decoded model output as a plain string.
    """
    prompt = f"Extract {info_type} from: {text}"
    encoded = gliner_tokenizer(prompt, return_tensors="pt")
    generated = gliner_model.generate(encoded.input_ids, max_length=100)
    return gliner_tokenizer.decode(generated[0], skip_special_tokens=True)
58
+
59
# Create the enhanced pipeline (module-level singleton consumed by main()).
enhanced_nlp = EnhancedNERPipeline(nlp_spacy, nlp_ner, gliner_model, gliner_tokenizer)
61
+
62
def extract_companies(doc):
    """Return the deduplicated union of GLinER company names and ORG entities."""
    from_gliner = set(doc._.gliner_companies)
    from_entities = {ent.text for ent in doc.ents if ent.label_ == "ORG"}
    return list(from_gliner | from_entities)
66
+
67
def extract_experience(doc):
    """Best-effort estimate of total years of experience.

    Combines the first integer found in the GLinER free-text answer with
    the largest "years since a DATE entity" value computed from the spaCy
    entities, and returns the maximum of the two. Returns 0 when neither
    source yields a usable number.
    """
    # Guard against GLinER answers with no digits: re.search returns None
    # there and the original unconditional .group() raised AttributeError.
    match = re.search(r'\d+', doc._.gliner_experience or "")
    gliner_years = int(match.group()) if match else 0

    current_year = datetime.now().year
    spacy_years = max(
        [
            current_year - parsed.year
            for ent in doc.ents
            if ent.label_ == "DATE"
            and (parsed := dateparser.parse(ent.text))
            and parsed.year <= current_year
        ]
        or [0]
    )
    return max(gliner_years, spacy_years)
71
+
72
def extract_education(doc):
    """Return the union of GLinER institutions and education-like ORG entities."""
    keywords = ("university", "college", "institute", "school")
    from_gliner = set(doc._.gliner_education)
    from_entities = {
        ent.text
        for ent in doc.ents
        if ent.label_ == "ORG" and any(k in ent.text.lower() for k in keywords)
    }
    return list(from_gliner | from_entities)
76
+
77
def authenticate_gemini(api_key):
    """Configure the Gemini SDK and return a generative model, or None on failure.

    Defined here because main() called this name but the file never
    defined it, so the app crashed with NameError on first upload.
    """
    try:
        genai.configure(api_key=api_key)
        # NOTE(review): model name assumed; confirm the intended Gemini model.
        return genai.GenerativeModel("gemini-pro")
    except Exception as e:
        st.error(f"Gemini authentication failed: {e}")
        return None


def extract_text_from_pdf(uploaded_file):
    """Extract plain text from an uploaded PDF via PyMuPDF (missing in original)."""
    with fitz.open(stream=uploaded_file.read(), filetype="pdf") as pdf:
        return "".join(page.get_text() for page in pdf)


def extract_text_from_doc(uploaded_file):
    """Extract plain text from an uploaded DOCX file (missing in original)."""
    document = Document(uploaded_file)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)


def generate_summary(text, model):
    """Ask the Gemini model for a summary of the resume text (missing in original)."""
    response = model.generate_content(f"Summarize this resume:\n\n{text}")
    return response.text


def main():
    """Streamlit entry point: upload a resume, extract fields, summarize."""
    st.title("Enhanced Resume Analyzer with GLinER Focus")

    api_key = st.text_input("Enter your Google Gemini API key", type="password")
    uploaded_file = st.file_uploader("Choose a PDF or DOCX file", type=["pdf", "docx"])

    if uploaded_file is not None and api_key:
        try:
            model = authenticate_gemini(api_key)
            if model is None:
                return

            # Dispatch on the MIME type reported by the uploader widget.
            if uploaded_file.type == "application/pdf":
                resume_text = extract_text_from_pdf(uploaded_file)
            elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
                resume_text = extract_text_from_doc(uploaded_file)
            else:
                st.error("Unsupported file format.")
                return

            doc = enhanced_nlp(resume_text)

            companies = extract_companies(doc)
            experience = extract_experience(doc)
            education = extract_education(doc)

            # Contact details come straight from GLinER prompts on raw text.
            phone = extract_info_with_gliner(resume_text, "phone number")
            email = extract_info_with_gliner(resume_text, "email address")
            linkedin = extract_info_with_gliner(resume_text, "LinkedIn profile")

            st.subheader("Extracted Information")
            st.write(f"**Years of Experience:** {experience}")
            st.write("**Companies:**", ", ".join(companies))
            st.write("**Education:**", ", ".join(education))
            st.write(f"**Phone Number:** {phone}")
            st.write(f"**Email:** {email}")
            st.write(f"**LinkedIn:** {linkedin}")

            summary = generate_summary(resume_text, model)
            st.subheader("Resume Summary")
            st.write(summary)

        except Exception as e:
            # Top-level boundary: surface the error in the UI rather than crash.
            st.error(f"Error during processing: {e}")
121
+
122
# Run the Streamlit app when executed as a script.
if __name__ == "__main__":
    main()