# resume_API/app.py
import pandas as pd
import re
import json
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import gradio as gr
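
# Resume-scoring Gradio app: reads reference lists of jobs, education entries, and
# skills from Excel files, matches them against an input resume text and a fixed job
# posting, extracts the candidate's name, location, and age with a Persian NER model,
# and returns the individual and average scores as JSON.
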
def read_from_excel(file_path):
    # The Excel file is expected to have an 'object' column holding the reference items
    df = pd.read_excel(file_path)
    items = df['object'].astype(str).tolist()  # convert every value to a string
    return items

def preprocess_text(text):
    # Remove unnecessary characters and normalize the text
    text = text.replace('\u200c', ' ').strip()  # replace zero-width non-joiners (half-spaces) and trim surrounding whitespace
    text = re.sub(r'\s+', ' ', text)  # collapse repeated whitespace
    return text

def extract_items_in_text(text, items):
    text = preprocess_text(text)
    found_items = set()  # use a set to avoid duplicates
    for item in items:
        item_normalized = preprocess_text(item)
        if item_normalized.lower() in text.lower():
            found_items.add(item_normalized)
    return list(found_items)

def compare_items(items_1, items_2):
    common_items = set()
    score = 0  # default score when no pair shares any words
    for item1 in items_1:
        for item2 in items_2:
            words1 = set(item1.lower().split())
            words2 = set(item2.lower().split())
            common_words = words1.intersection(words2)
            num_common = len(common_words)
            # Keep the highest score seen across all pairs, not whichever pair came last
            if num_common >= 3:
                common_items.add((item1, item2))
                score = max(score, 100)
            elif num_common == 2:
                common_items.add((item1, item2))
                score = max(score, 75)
            elif num_common == 1:
                common_items.add((item1, item2))
                score = max(score, 50)
    return score, common_items

def compare_skills(skill_1, skill_2):
    common_skill = set(skill_1).intersection(set(skill_2))
    num_common = len(common_skill)
    # Tiered score based on how many skills overlap
    if num_common >= 10:
        score = 100
    elif num_common >= 7:
        score = 75
    elif num_common >= 5:
        score = 50
    else:
        score = 25
    return score, common_skill

def extract_ner_info(text, nlp):
    # Token-classification pipeline output: a list of dicts with 'entity' and 'word' keys
    ner_results = nlp(text)
    full_name = ''
    loc = ''
    age = None
    for i in range(len(ner_results)):
        # Stitch a person name together from a B-PER token and its following I-PER sub-tokens
        if ner_results[i]['entity'] == 'B-PER':
            full_name = ner_results[i]['word']
            for j in range(i + 1, len(ner_results)):
                if ner_results[j]['entity'].startswith('I-PER'):
                    full_name += ner_results[j]['word'].replace('##', '')
                else:
                    break
        # Keep only the first location entity
        if ner_results[i]['entity'] == 'B-LOC' and not loc:
            loc = ner_results[i]['word']
    # Age is read from an explicit "سن : <number>" pattern in the raw text
    age_match = re.search(r'سن\s*:\s*(\d+)', text)
    if age_match:
        age = int(age_match.group(1))
    return full_name, loc, age

def process_text(input_text):
    # Paths to the Excel files holding the reference lists
    job_excel_file_path = 'jobs_output.xlsx'
    education_excel_file_path = 'education_output.xlsx'
    skills_excel_file_path = 'N_F_skill_output.xlsx'
    # Read jobs, education entries, and skills from the Excel files
    jobs = read_from_excel(job_excel_file_path)
    education = read_from_excel(education_excel_file_path)
    skills = read_from_excel(skills_excel_file_path)
    # Fixed reference text (the job posting the resume is compared against)
    fixed_text = """استخدام کارآموز هوش مصنوعی (AI-شیراز)"""
    # Drop the literal word "آدرس" ("address") from the input before matching
    input_text = input_text.replace("آدرس", "")
    # Extract jobs, education entries, and skills from both texts
    jobs_in_fixed_text = extract_items_in_text(fixed_text, jobs)
    jobs_in_input_text = extract_items_in_text(input_text, jobs)
    education_in_fixed_text = extract_items_in_text(fixed_text, education)
    education_in_input_text = extract_items_in_text(input_text, education)
    skills_in_fixed_text = extract_items_in_text(fixed_text, skills)
    skills_in_input_text = extract_items_in_text(input_text, skills)
    # Compare and score
    job_score, common_jobs = compare_items(jobs_in_fixed_text, jobs_in_input_text)
    education_score, common_education = compare_items(education_in_fixed_text, education_in_input_text)
    skill_score, common_skills = compare_skills(skills_in_fixed_text, skills_in_input_text)
    # Set up the NER model (reloaded on every call; could be cached at module level)
    model_name_or_path = "HooshvareLab/distilbert-fa-zwnj-base-ner"
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    model = AutoModelForTokenClassification.from_pretrained(model_name_or_path)  # PyTorch
    nlp = pipeline("ner", model=model, tokenizer=tokenizer)
    # Extract NER information
    full_name, loc, age = extract_ner_info(input_text, nlp)
    # Location score
    fixed_loc = "شیراز"
    loc_score = 100 if loc == fixed_loc else 0
    # Age score
    age_score = 100 if age and 18 <= age <= 30 else 0
    # Average of the individual scores
    average_score = (job_score + education_score + skill_score + loc_score + age_score) / 5
    # Build the JSON output
    output = {
        "average_score": average_score,
        "full_name": full_name,
        "age": age,
        "location": loc,
        "job_score": job_score,
        "education_score": education_score,
        "skill_score": skill_score,
        "loc_score": loc_score,
        "age_score": age_score,
        "common_jobs": list(common_jobs),
        "common_education": list(common_education),
        "common_skills": list(common_skills)
    }
    return json.dumps(output, ensure_ascii=False, indent=4)

iface = gr.Interface(
    fn=process_text,
    inputs=gr.Textbox(lines=10, placeholder="لطفاً متن خود را وارد کنید..."),
    outputs="json",
    title="متن پرداز",
    description="این ابزار متن شما را پردازش کرده و امتیازات مشابهت را محاسبه می‌کند."
)

if __name__ == "__main__":
    iface.launch()
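
# Minimal local-test sketch (an assumption, not part of the deployed app): with the three
# Excel files referenced in process_text placed next to app.py, the scorer can be called
# directly, e.g.:
#   sample = "کارآموز هوش مصنوعی ساکن شیراز، سن : 25، آشنا با Python"  # made-up resume text
#   print(process_text(sample))  # prints the JSON string with individual and average scores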