File size: 3,080 Bytes
2f0136f
 
 
 
 
 
 
 
 
 
 
 
51ba203
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2f0136f
 
 
 
 
51ba203
 
 
 
 
 
 
 
 
 
 
 
 
2f0136f
51ba203
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import re

# Load saved model and tokenizer.
# These statements run at module import time, so importing this file loads
# the checkpoint before any function below can be called.
# NOTE(review): the identifier looks like a Hugging Face Hub repo id - confirm.
model_checkpoint = "24NLPGroupO/EmailGeneration"
# NOTE(review): truncation=True is passed to from_pretrained here rather than
# at tokenization/call time - confirm it has the intended effect.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, truncation=True)
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

# Set up the generation pipeline; `generator` is the module-level callable
# that generate_email() invokes with the prompt and sampling options.
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

def clean_generated_text(text: str) -> str:
    """Strip e-mail boilerplate from generated text and fill in placeholders.

    The cleaning steps, in order:

    1. Remove a leading ``Re:`` / ``Fwd:`` marker.
    2. Cut the text at the first occurrence of a trailing-section marker
       (signature, ``PHONE``, ``Email:``, ``Cc:``, attachments list,
       copyright line), discarding the marker and everything after it.
    3. Remove the ``URL`` placeholder and substitute fixed values for the
       ``CURRENCYNUMBER`` and ``NUMBER`` placeholders.
    4. Cut the text at ``About Us``.
    5. Remove spans that look like a street address ending in a 5-digit code.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    # Only a marker at the very start of the text is removed.
    text = re.sub(r'^(Re:|Fwd:)', '', text)

    # With DOTALL, '.' also matches newlines, so each of these patterns
    # removes everything from the marker to the end of the text.
    trailing_sections = (
        r'Best regards,.*$',     # signature
        r'PHONE.*$',             # phone numbers
        r'Email:.*$',            # email addresses
        r'Cc:.*$',               # CC list
        r'\* Attachments:.*',    # attachments
        r'©️ .*$',                # copyright and ownership statements
    )
    for pattern in trailing_sections:
        text = re.sub(pattern, '', text, flags=re.DOTALL)

    text = re.sub(r'URL', '', text)
    # 'CURRENCYNUMBER' contains 'NUMBER', so it must be substituted first;
    # otherwise it is rewritten to 'CURRENCY10' and never gets its own value.
    text = re.sub(r'CURRENCYNUMBER', 'USD 100', text)
    text = re.sub(r'NUMBER', '10', text)

    # Remove 'About Us' and all following text.
    text = re.sub(r'About Us.*', '', text, flags=re.DOTALL)

    # "<digits> <word> <suffix>[.][,] ... <5 digits>" on a single line
    # (no DOTALL here, so a match never spans a newline).
    for suffix in ('St', 'Ave', 'Rd', 'Ln', 'Blvd', 'Dr', 'Ct'):
        text = re.sub(r'\d+ [^\s]+ ' + suffix + r'\.?,?.*?\d{5}', '', text)

    return text.strip()

def generate_email(product, gender, profession, hobby):
    """Generate an e-mail for the given attributes and return the longest draft.

    The four arguments are joined with single spaces to form the prompt.
    Two candidate texts are sampled from the module-level ``generator``,
    each is passed through ``clean_generated_text``, and the candidate that
    is longest after cleaning is returned (the first one wins a tie).

    Args:
        product: Value interpolated first into the prompt.
        gender: Value interpolated second into the prompt.
        profession: Value interpolated third into the prompt.
        hobby: Value interpolated last into the prompt.

    Returns:
        The longest cleaned candidate text.
    """
    prompt = f"{product} {gender} {profession} {hobby}"

    sampling_options = dict(
        max_length=256,           # upper bound passed to the pipeline
        top_k=40,                 # top-k sampling cutoff
        top_p=0.6,                # nucleus sampling threshold
        temperature=0.4,          # sampling temperature
        repetition_penalty=1.5,   # discourage repeated content
        num_return_sequences=2,   # two candidates per call
        do_sample=True,
    )
    candidates = generator(prompt, **sampling_options)

    # Selection is purely by length of the cleaned text.
    return max(
        (clean_generated_text(candidate['generated_text']) for candidate in candidates),
        key=len,
    )