Mia2024 committed on
Commit
51ba203
·
verified ·
1 Parent(s): f4aec45

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +33 -22
utils.py CHANGED
@@ -10,31 +10,42 @@ model = AutoModelForCausalLM.from_pretrained(model_checkpoint)
10
  generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
11
 
12
  def clean_generated_text(text):
13
- # Basic cleaning
14
- text = re.sub(r'^(Re:|Fwd:)', '', text) # Remove reply and forward marks
15
- text = re.sub(r'Best regards,.*$', '', text, flags=re.DOTALL) # Remove everything after signature
16
- text = re.sub(r'PHONE.*$', '', text, flags=re.DOTALL) # Remove everything after phone numbers
17
- text = re.sub(r'Email:.*$', '', text, flags=re.DOTALL) # Remove everything after email addresses
18
- text = re.sub(r'Cc:.*$', '', text, flags=re.DOTALL) # Remove CC list
19
- text = re.sub(r'\* Attachments:.*', '', text, flags=re.S) # Remove 'Attachments:' and everything following it
20
- text = re.sub(r'©️ .*$', '', text, flags=re.DOTALL) # Remove copyright and ownership statements
21
- text = re.sub(r'URL If this message is not displaying properly, click here.*$', '', text, flags=re.DOTALL) # Remove error display message and links
22
- text = re.sub(r'\d{5,}', 'NUMBER', text) # Replace long sequences of numbers, likely phone numbers or ZIP codes
 
 
 
 
 
 
 
 
 
23
  return text.strip()
24
 
25
  def generate_email(product, gender, profession, hobby):
26
  input_text = f"{product} {gender} {profession} {hobby}"
27
  result = generator(
28
- input_text, # Initial text to prompt the model. Sets the context or topic for text generation.
29
- max_length=256, # Maximum length of the generated text in tokens, limiting the output size.
30
- do_sample=True, # Enables stochastic sampling; the model can generate diverse outputs at each step.
31
- top_k=20, # Limits the vocabulary considered at each step to the top-k most likely next words.
32
- top_p=0.6, # Uses nucleus sampling: Narrows down to the smallest set of words totaling 60% of the likelihood.
33
- temperature=0.4, # Scales logits before sampling to reduce randomness and produce more deterministic output.
34
- repetition_penalty=1.5, # Penalizes words that were already mentioned, reducing repetition in the text.
35
- # truncation=True, # Truncates the output to the maximum length if it exceeds it.
36
- num_return_sequences=3 # Generates three different sequences to choose from, enhancing output variety.
37
- )
38
- # Select the best output from the generated sequences
39
- best_text = sorted([clean_generated_text(r['generated_text']) for r in result], key=len)[-1]
 
40
  return best_text
 
 
10
  generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
11
 
12
  def clean_generated_text(text):
13
+ #Basic cleaning
14
+ text = re.sub(r'^(Re:|Fwd:)', '', text) # Remove reply and forward marks
15
+ text = re.sub(r'Best regards,.*$', '', text, flags=re.DOTALL) # Remove signature
16
+ text = re.sub(r'PHONE.*$', '', text, flags=re.DOTALL) # Remove phone numbers
17
+ text = re.sub(r'Email:.*$', '', text, flags=re.DOTALL) # Remove email addresses
18
+ text = re.sub(r'Cc:.*$', '', text, flags=re.DOTALL) # Remove CC list
19
+ text = re.sub(r'\* Attachments:.*', '', text, flags=re.S) # Remove Attachments
20
+ text = re.sub(r'©️ .*$', '', text, flags=re.DOTALL) # Remove copyright and ownership statements
21
+ text = re.sub(r'URL', '', text) # Remove URLs
22
+ text = re.sub(r'NUMBER', '10', text) # Replace 'NUMBER' with a real number
23
+ text = re.sub(r'CURRENCYNUMBER', 'USD 100', text) # Replace 'CURRENCYNUMBER' with a real value
24
+ text = re.sub(r'About Us.*', '', text, flags=re.DOTALL) # Remove 'About Us' and all following text
25
+ text = re.sub(r'\d+ [^\s]+ St\.?,?.*?\d{5}', '', text) # Remove street
26
+ text = re.sub(r'\d+ [^\s]+ Ave\.?,?.*?\d{5}', '', text) # Remove avenues
27
+ text = re.sub(r'\d+ [^\s]+ Rd\.?,?.*?\d{5}', '', text) # Remove roads
28
+ text = re.sub(r'\d+ [^\s]+ Ln\.?,?.*?\d{5}', '', text) # Remove lanes
29
+ text = re.sub(r'\d+ [^\s]+ Blvd\.?,?.*?\d{5}', '', text) # Remove boulevards
30
+ text = re.sub(r'\d+ [^\s]+ Dr\.?,?.*?\d{5}', '', text) # Remove drives
31
+ text = re.sub(r'\d+ [^\s]+ Ct\.?,?.*?\d{5}', '', text) # Remove courts
32
  return text.strip()
33
 
34
  def generate_email(product, gender, profession, hobby):
35
  input_text = f"{product} {gender} {profession} {hobby}"
36
  result = generator(
37
+ input_text, # The starting text that guides the model on what to generate
38
+ max_length=256, # Set a suitable maximum length
39
+ top_k=40, # Consider more top options words
40
+ top_p=0.6, # Control the probability range for word choices
41
+ temperature=0.4, # Control the randomness of generation
42
+ repetition_penalty=1.5, # Reduce content repetition
43
+ num_return_sequences=2, # Generate three texts
44
+ do_sample=True
45
+ )
46
+ # Clean each generated text
47
+ cleaned_texts = [clean_generated_text(seq['generated_text']) for seq in result]
48
+ # Choose the best text based on length and clarity
49
+ best_text = max(cleaned_texts, key=len)
50
  return best_text
51
+