pszemraj committed on
Commit
471b053
·
1 Parent(s): e219aa1

🚸 kw based file naming

Browse files

Signed-off-by: peter szemraj <[email protected]>

Files changed (1) hide show
  1. utils.py +64 -5
utils.py CHANGED
@@ -9,6 +9,12 @@ from pathlib import Path
9
 
10
  import torch
11
  from natsort import natsorted
 
 
 
 
 
 
12
 
13
 
14
  def validate_pytorch2(torch_version: str = None):
@@ -88,6 +94,57 @@ def load_example_filenames(example_path: str or Path):
88
  return examples
89
 
90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  def saves_summary(
92
  summarize_output, outpath: str or Path = None, add_signature=True, **kwargs
93
  ):
@@ -99,16 +156,18 @@ def saves_summary(
99
  add_signature: whether to add a signature to the output file
100
  kwargs: additional keyword arguments to include in the output file
101
  """
102
- outpath = (
103
- Path.cwd() / f"document_summary_{get_timestamp()}.txt"
104
- if outpath is None
105
- else Path(outpath)
106
- )
107
  sum_text = [f"\t{s['summary'][0]}\n" for s in summarize_output]
108
  sum_scores = [f"\n - {round(s['summary_score'],4)}" for s in summarize_output]
109
  scores_text = "\n".join(sum_scores)
110
  full_summary = "\n".join(sum_text)
111
 
 
 
 
 
 
 
 
112
  with open(
113
  outpath,
114
  "w",
 
9
 
10
  import torch
11
  from natsort import natsorted
12
+ from typing import List
13
+ from nltk.tokenize import sent_tokenize, word_tokenize
14
+ from itertools import combinations
15
+ from collections import defaultdict
16
+ from rapidfuzz import fuzz
17
+ from nltk.corpus import stopwords
18
 
19
 
20
  def validate_pytorch2(torch_version: str = None):
 
94
  return examples
95
 
96
 
97
def extract_keywords(text: str, num_keywords: int = 3) -> List[str]:
    """
    Extract keywords from a text using a TextRank-style co-occurrence ranking.

    Words appearing in the same sentence are linked in a weighted graph, and
    ten rounds of damped score propagation (damping factor 0.85) rank them.
    Near-duplicate keywords (rapidfuzz ratio above 70) are collapsed onto the
    higher-ranked spelling.

    Args:
        text: The text to extract keywords from.
        num_keywords: The maximum number of keywords to return. Default is 3.

    Returns:
        A list of at most ``num_keywords`` lowercase keywords, best first.
        The list is shorter (possibly empty) when the text does not contain
        enough distinct co-occurring words.
    """
    if not text or num_keywords <= 0:
        return []

    stop_words = set(stopwords.words("english"))

    # Drop stopwords on whitespace-split tokens first: this removes whole
    # contractions ("don't") before the tokenizer splits them into pieces.
    text = " ".join(word for word in text.lower().split() if word not in stop_words)

    # Tokenize, then filter again: a stopword glued to punctuation ("the,")
    # only becomes recognisable after word_tokenize separates the two. Tokens
    # with no alphanumeric character ("...", "---") are not words at all.
    sentences = [
        [
            token
            for token in word_tokenize(sentence)
            if len(token) >= 3
            and token not in stop_words
            and any(ch.isalnum() for ch in token)
        ]
        for sentence in sent_tokenize(text)
    ]

    # Build a symmetric, weighted graph of within-sentence co-occurrences.
    cooccur = defaultdict(lambda: defaultdict(int))
    for sentence in sentences:
        for w1, w2 in combinations(sentence, 2):
            cooccur[w1][w2] += 1
            cooccur[w2][w1] += 1

    # Total edge weight per word is loop-invariant, so compute it once.
    total_weight = {word: sum(links.values()) for word, links in cooccur.items()}

    # Propagate scores; updates are applied in place within each round.
    scores = defaultdict(float)
    for _ in range(10):
        for word, links in cooccur.items():
            scores[word] = 0.15 + 0.85 * sum(
                weight / total_weight[other] * scores[other]
                for other, weight in links.items()
            )

    # Walk the full ranking (not just the top slice) so that dropping a
    # near-duplicate does not shrink the result below num_keywords when
    # further candidates are available.
    final_keywords = []
    for keyword in sorted(scores, key=scores.get, reverse=True):
        if any(fuzz.ratio(keyword, other) > 70 for other in final_keywords):
            continue
        final_keywords.append(keyword)
        if len(final_keywords) == num_keywords:
            break

    return final_keywords
146
+
147
+
148
  def saves_summary(
149
  summarize_output, outpath: str or Path = None, add_signature=True, **kwargs
150
  ):
 
156
  add_signature: whether to add a signature to the output file
157
  kwargs: additional keyword arguments to include in the output file
158
  """
 
 
 
 
 
159
  sum_text = [f"\t{s['summary'][0]}\n" for s in summarize_output]
160
  sum_scores = [f"\n - {round(s['summary_score'],4)}" for s in summarize_output]
161
  scores_text = "\n".join(sum_scores)
162
  full_summary = "\n".join(sum_text)
163
 
164
+ keywords = "_".join(extract_keywords(full_summary))
165
+ outpath = (
166
+ Path.cwd() / f"document_summary_{get_timestamp()}_{keywords}.txt"
167
+ if outpath is None
168
+ else Path(outpath)
169
+ )
170
+
171
  with open(
172
  outpath,
173
  "w",