Spaces:
Sleeping
Sleeping
File size: 6,258 Bytes
7b059c6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 |
import pandas as pd
import re
import json
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import gradio as gr
def read_from_excel(file_path):
    """Load the 'object' column of an Excel sheet as a list of strings.

    Every cell is coerced to ``str`` so downstream substring matching
    never sees NaN/float values.
    """
    frame = pd.read_excel(file_path)
    return frame['object'].astype(str).tolist()
def preprocess_text(text):
    """Normalize Persian text for matching: ZWNJ -> space, collapse whitespace.

    U+200C (zero-width non-joiner) is common in Persian and would defeat
    naive substring comparison, so it is turned into a plain space first.
    """
    without_zwnj = text.replace('\u200c', ' ').strip()
    return re.sub(r'\s+', ' ', without_zwnj)
def extract_items_in_text(text, items):
    """Return the normalized, de-duplicated items that occur in *text*.

    Matching is case-insensitive substring search over whitespace-normalized
    text (see ``preprocess_text``).

    Args:
        text: the document to search in.
        items: iterable of candidate strings (e.g. job titles, skills).

    Returns:
        list of normalized items found at least once in *text*.
    """
    # Perf fix: the original recomputed text.lower() on every iteration;
    # normalize and lowercase the haystack exactly once.
    haystack = preprocess_text(text).lower()
    found_items = set()  # set => each item is reported at most once
    for item in items:
        item_normalized = preprocess_text(item)
        if item_normalized.lower() in haystack:
            found_items.add(item_normalized)
    return list(found_items)
def compare_items(items_1, items_2):
    """Score word overlap between two lists of items.

    Every cross pair sharing at least one word is collected, and the overall
    score is the BEST pair score: >=3 common words -> 100, 2 -> 75, 1 -> 50,
    none anywhere -> 0.

    Args:
        items_1, items_2: lists of strings (multi-word items).

    Returns:
        (score, common_items): int score and a set of matching (item1, item2)
        tuples.
    """
    common_items = set()
    score = 0  # default when no pair shares a word
    for item1 in items_1:
        words1 = set(item1.lower().split())  # hoisted out of the inner loop
        for item2 in items_2:
            num_common = len(words1 & set(item2.lower().split()))
            if num_common >= 3:
                pair_score = 100
            elif num_common == 2:
                pair_score = 75
            elif num_common == 1:
                pair_score = 50
            else:
                continue  # no overlap: pair not recorded
            common_items.add((item1, item2))
            # Bug fix: the original kept the LAST matching pair's score, and
            # since the inputs come from sets the "last" pair was
            # nondeterministic; keep the maximum instead.
            score = max(score, pair_score)
    return score, common_items
def compare_skills(skill_1, skill_2):
    """Score the intersection of two skill lists.

    Thresholds on the number of shared skills:
    >=10 -> 100, >=7 -> 75, >=5 -> 50, otherwise 25.

    Args:
        skill_1, skill_2: iterables of skill strings.

    Returns:
        (score, common_skill): int score and the set of shared skills.
    """
    common_skill = set(skill_1) & set(skill_2)
    num_common = len(common_skill)
    # Bug fix: the original used exact equality (== 7, == 5), so 6, 8 or 9
    # shared skills scored only 25 while 5 scored 50; use range thresholds.
    if num_common >= 10:
        score = 100
    elif num_common >= 7:
        score = 75
    elif num_common >= 5:
        score = 50
    else:
        score = 25
    return score, common_skill
def extract_ner_info(text, nlp):
    """Extract a person name, a location, and an age from *text*.

    Args:
        text: the raw input text.
        nlp: a token-classification pipeline; called as ``nlp(text)`` and
            expected to return dicts with ``'entity'`` and ``'word'`` keys.

    Returns:
        (full_name, loc, age): the name assembled from the last B-PER run
        (I-PER subword markers '##' stripped), the first B-LOC token, and an
        int age parsed from an explicit "سن : N" marker (None when absent).
    """
    entities = nlp(text)
    full_name, loc, age = '', '', None
    for idx, entity in enumerate(entities):
        tag = entity['entity']
        if tag == 'B-PER':
            # Start a fresh name, then glue on the immediately following
            # I-PER subword tokens until the run ends.
            parts = [entity['word']]
            for follower in entities[idx + 1:]:
                if not follower['entity'].startswith('I-PER'):
                    break
                parts.append(follower['word'].replace('##', ''))
            full_name = ''.join(parts)
        elif tag == 'B-LOC' and not loc:
            loc = entity['word']
    # The age comes from an explicit "<label> : <digits>" pattern, not from NER.
    age_match = re.search(r'سن\s*:\s*(\d+)', text)
    if age_match:
        age = int(age_match.group(1))
    return full_name, loc, age
# Lazily-built NER pipeline, cached at module level so the model weights are
# loaded once per process. (The original reloaded tokenizer + model + pipeline
# from disk on EVERY request — by far the slowest part of the handler.)
_NLP = None


def _get_nlp():
    """Build (on first use) and return the cached Persian NER pipeline."""
    global _NLP
    if _NLP is None:
        model_name_or_path = "HooshvareLab/distilbert-fa-zwnj-base-ner"
        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        model = AutoModelForTokenClassification.from_pretrained(model_name_or_path)  # PyTorch
        _NLP = pipeline("ner", model=model, tokenizer=tokenizer)
    return _NLP


def process_text(input_text):
    """Score *input_text* (a resume-like text) against a fixed job ad.

    Reads job/education/skill vocabularies from Excel files in the working
    directory, extracts matches from both texts, runs NER on the input, and
    returns the scores as a JSON string (Persian preserved, not ASCII-escaped).
    """
    # Vocabulary files, expected next to the script.
    job_excel_file_path = 'jobs_output.xlsx'
    education_excel_file_path = 'education_output.xlsx'
    skills_excel_file_path = 'N_F_skill_output.xlsx'

    # Load the jobs, education and skills vocabularies.
    jobs = read_from_excel(job_excel_file_path)
    education = read_from_excel(education_excel_file_path)
    skills = read_from_excel(skills_excel_file_path)

    # The fixed job advertisement the input is compared against.
    fixed_text = """استخدام کارآموز هوش مصنوعی (AI-شیراز)"""

    # Drop the address label so its word doesn't pollute matching/NER.
    input_text = input_text.replace("آدرس", "")

    # Extract jobs, education and skills from both texts.
    jobs_in_fixed_text = extract_items_in_text(fixed_text, jobs)
    jobs_in_input_text = extract_items_in_text(input_text, jobs)
    education_in_fixed_text = extract_items_in_text(fixed_text, education)
    education_in_input_text = extract_items_in_text(input_text, education)
    skills_in_fixed_text = extract_items_in_text(fixed_text, skills)
    skills_in_input_text = extract_items_in_text(input_text, skills)

    # Compare and score each category.
    job_score, common_jobs = compare_items(jobs_in_fixed_text, jobs_in_input_text)
    education_score, common_education = compare_items(education_in_fixed_text, education_in_input_text)
    skill_score, common_skills = compare_skills(skills_in_fixed_text, skills_in_input_text)

    # NER: person name, location and age (pipeline is cached, see _get_nlp).
    full_name, loc, age = extract_ner_info(input_text, _get_nlp())

    # Location score: all-or-nothing match against the ad's city.
    fixed_loc = "شیراز"
    loc_score = 100 if loc == fixed_loc else 0

    # Age score: full marks inside the 18-30 target range, else 0.
    # (`age and` also guards against age being None.)
    age_score = 100 if age and 18 <= age <= 30 else 0

    # Unweighted mean over the five category scores.
    average_score = (job_score + education_score + skill_score + loc_score + age_score) / 5

    # Assemble the JSON payload (sets converted to lists for serialization).
    output = {
        "average_score": average_score,
        "full_name": full_name,
        "age": age,
        "location": loc,
        "job_score": job_score,
        "education_score": education_score,
        "skill_score": skill_score,
        "loc_score": loc_score,
        "age_score": age_score,
        "common_jobs": list(common_jobs),
        "common_education": list(common_education),
        "common_skills": list(common_skills)
    }
    return json.dumps(output, ensure_ascii=False, indent=4)
# Gradio UI: one free-text input box, the JSON score report as output.
# Fix: `gr.inputs.Textbox` is the deprecated pre-3.x API and was removed in
# Gradio >= 3.0 — components now live at the top level (`gr.Textbox`).
iface = gr.Interface(
    fn=process_text,
    inputs=gr.Textbox(lines=10, placeholder="لطفاً متن خود را وارد کنید..."),
    outputs="json",
    title="متن پرداز",
    description="این ابزار متن شما را پردازش کرده و امتیازات مشابهت را محاسبه میکند."
)

if __name__ == "__main__":
    iface.launch()