Spaces:
Sleeping
Sleeping
File size: 6,258 Bytes
7b059c6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 |
import pandas as pd
import re
import json
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import gradio as gr
def read_from_excel(file_path):
    """Load the 'object' column of an Excel sheet as a list of strings.

    Every cell is coerced to ``str`` so downstream substring matching
    never sees NaN/float values.
    """
    frame = pd.read_excel(file_path)
    return frame['object'].astype(str).tolist()
def preprocess_text(text):
    """Normalize Persian text for matching: ZWNJ -> space, collapse whitespace.

    U+200C (zero-width non-joiner) is common in Persian and would defeat
    naive substring comparison, so it is turned into a plain space first.
    """
    without_zwnj = text.replace('\u200c', ' ').strip()
    return re.sub(r'\s+', ' ', without_zwnj)
def extract_items_in_text(text, items):
    """Return the normalized, de-duplicated items that occur in *text*.

    Matching is case-insensitive substring search over whitespace-normalized
    text (see ``preprocess_text``).

    Args:
        text: the document to search in.
        items: iterable of candidate strings (e.g. job titles, skills).

    Returns:
        list of normalized items found at least once in *text*.
    """
    # Perf fix: the original recomputed text.lower() on every iteration;
    # normalize and lowercase the haystack exactly once.
    haystack = preprocess_text(text).lower()
    found_items = set()  # set => each item is reported at most once
    for item in items:
        item_normalized = preprocess_text(item)
        if item_normalized.lower() in haystack:
            found_items.add(item_normalized)
    return list(found_items)
def compare_items(items_1, items_2):
    """Score word overlap between two lists of items.

    Every cross pair sharing at least one word is collected, and the overall
    score is the BEST pair score: >=3 common words -> 100, 2 -> 75, 1 -> 50,
    none anywhere -> 0.

    Args:
        items_1, items_2: lists of strings (multi-word items).

    Returns:
        (score, common_items): int score and a set of matching (item1, item2)
        tuples.
    """
    common_items = set()
    score = 0  # default when no pair shares a word
    for item1 in items_1:
        words1 = set(item1.lower().split())  # hoisted out of the inner loop
        for item2 in items_2:
            num_common = len(words1 & set(item2.lower().split()))
            if num_common >= 3:
                pair_score = 100
            elif num_common == 2:
                pair_score = 75
            elif num_common == 1:
                pair_score = 50
            else:
                continue  # no overlap: pair not recorded
            common_items.add((item1, item2))
            # Bug fix: the original kept the LAST matching pair's score, and
            # since the inputs come from sets the "last" pair was
            # nondeterministic; keep the maximum instead.
            score = max(score, pair_score)
    return score, common_items
def compare_skills(skill_1, skill_2):
    """Score the intersection of two skill lists.

    Thresholds on the number of shared skills:
    >=10 -> 100, >=7 -> 75, >=5 -> 50, otherwise 25.

    Args:
        skill_1, skill_2: iterables of skill strings.

    Returns:
        (score, common_skill): int score and the set of shared skills.
    """
    common_skill = set(skill_1) & set(skill_2)
    num_common = len(common_skill)
    # Bug fix: the original used exact equality (== 7, == 5), so 6, 8 or 9
    # shared skills scored only 25 while 5 scored 50; use range thresholds.
    if num_common >= 10:
        score = 100
    elif num_common >= 7:
        score = 75
    elif num_common >= 5:
        score = 50
    else:
        score = 25
    return score, common_skill
def extract_ner_info(text, nlp):
    """Extract a person name, a location, and an age from *text*.

    Args:
        text: the raw input text.
        nlp: a token-classification pipeline; called as ``nlp(text)`` and
            expected to return dicts with ``'entity'`` and ``'word'`` keys.

    Returns:
        (full_name, loc, age): the name assembled from the last B-PER run
        (I-PER subword markers '##' stripped), the first B-LOC token, and an
        int age parsed from an explicit "سن : N" marker (None when absent).
    """
    entities = nlp(text)
    full_name, loc, age = '', '', None
    for idx, entity in enumerate(entities):
        tag = entity['entity']
        if tag == 'B-PER':
            # Start a fresh name, then glue on the immediately following
            # I-PER subword tokens until the run ends.
            parts = [entity['word']]
            for follower in entities[idx + 1:]:
                if not follower['entity'].startswith('I-PER'):
                    break
                parts.append(follower['word'].replace('##', ''))
            full_name = ''.join(parts)
        elif tag == 'B-LOC' and not loc:
            loc = entity['word']
    # The age comes from an explicit "<label> : <digits>" pattern, not from NER.
    age_match = re.search(r'سن\s*:\s*(\d+)', text)
    if age_match:
        age = int(age_match.group(1))
    return full_name, loc, age
# Lazily-built NER pipeline, cached at module level so the model weights are
# loaded once per process. (The original reloaded tokenizer + model + pipeline
# from disk on EVERY request — by far the slowest part of the handler.)
_NLP = None


def _get_nlp():
    """Build (on first use) and return the cached Persian NER pipeline."""
    global _NLP
    if _NLP is None:
        model_name_or_path = "HooshvareLab/distilbert-fa-zwnj-base-ner"
        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        model = AutoModelForTokenClassification.from_pretrained(model_name_or_path)  # PyTorch
        _NLP = pipeline("ner", model=model, tokenizer=tokenizer)
    return _NLP


def process_text(input_text):
    """Score *input_text* (a resume-like text) against a fixed job ad.

    Reads job/education/skill vocabularies from Excel files in the working
    directory, extracts matches from both texts, runs NER on the input, and
    returns the scores as a JSON string (Persian preserved, not ASCII-escaped).
    """
    # Vocabulary files, expected next to the script.
    job_excel_file_path = 'jobs_output.xlsx'
    education_excel_file_path = 'education_output.xlsx'
    skills_excel_file_path = 'N_F_skill_output.xlsx'

    # Load the jobs, education and skills vocabularies.
    jobs = read_from_excel(job_excel_file_path)
    education = read_from_excel(education_excel_file_path)
    skills = read_from_excel(skills_excel_file_path)

    # The fixed job advertisement the input is compared against.
    fixed_text = """استخدام کارآموز هوش مصنوعی (AI-شیراز)"""

    # Drop the address label so its word doesn't pollute matching/NER.
    input_text = input_text.replace("آدرس", "")

    # Extract jobs, education and skills from both texts.
    jobs_in_fixed_text = extract_items_in_text(fixed_text, jobs)
    jobs_in_input_text = extract_items_in_text(input_text, jobs)
    education_in_fixed_text = extract_items_in_text(fixed_text, education)
    education_in_input_text = extract_items_in_text(input_text, education)
    skills_in_fixed_text = extract_items_in_text(fixed_text, skills)
    skills_in_input_text = extract_items_in_text(input_text, skills)

    # Compare and score each category.
    job_score, common_jobs = compare_items(jobs_in_fixed_text, jobs_in_input_text)
    education_score, common_education = compare_items(education_in_fixed_text, education_in_input_text)
    skill_score, common_skills = compare_skills(skills_in_fixed_text, skills_in_input_text)

    # NER: person name, location and age (pipeline is cached, see _get_nlp).
    full_name, loc, age = extract_ner_info(input_text, _get_nlp())

    # Location score: all-or-nothing match against the ad's city.
    fixed_loc = "شیراز"
    loc_score = 100 if loc == fixed_loc else 0

    # Age score: full marks inside the 18-30 target range, else 0.
    # (`age and` also guards against age being None.)
    age_score = 100 if age and 18 <= age <= 30 else 0

    # Unweighted mean over the five category scores.
    average_score = (job_score + education_score + skill_score + loc_score + age_score) / 5

    # Assemble the JSON payload (sets converted to lists for serialization).
    output = {
        "average_score": average_score,
        "full_name": full_name,
        "age": age,
        "location": loc,
        "job_score": job_score,
        "education_score": education_score,
        "skill_score": skill_score,
        "loc_score": loc_score,
        "age_score": age_score,
        "common_jobs": list(common_jobs),
        "common_education": list(common_education),
        "common_skills": list(common_skills)
    }
    return json.dumps(output, ensure_ascii=False, indent=4)
# Gradio UI: one free-text input box, the JSON score report as output.
# Fix: `gr.inputs.Textbox` is the deprecated pre-3.x API and was removed in
# Gradio >= 3.0 — components now live at the top level (`gr.Textbox`).
iface = gr.Interface(
    fn=process_text,
    inputs=gr.Textbox(lines=10, placeholder="لطفاً متن خود را وارد کنید..."),
    outputs="json",
    title="متن پرداز",
    description="این ابزار متن شما را پردازش کرده و امتیازات مشابهت را محاسبه میکند."
)

if __name__ == "__main__":
    iface.launch()