Spaces:
Sleeping
Sleeping
Update modules/extractive.py
Browse files- modules/extractive.py +148 -19
modules/extractive.py
CHANGED
@@ -1,47 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import numpy as np
|
|
|
|
|
2 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
3 |
from sklearn.metrics.pairwise import cosine_similarity
|
4 |
from summarizer import Summarizer
|
5 |
-
import networkx as nx
|
6 |
|
7 |
|
8 |
class TFIDFSummarizer:
|
|
|
|
|
9 |
@staticmethod
|
10 |
-
def summarize(sentences, preprocessed_sentences, num_sentences):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
vectorizer = TfidfVectorizer()
|
12 |
tfidf_matrix = vectorizer.fit_transform(preprocessed_sentences)
|
13 |
-
|
14 |
-
ranked_indices = np.argsort(
|
15 |
-
|
|
|
16 |
|
17 |
|
18 |
class TextRankSummarizer:
|
|
|
|
|
19 |
@staticmethod
|
20 |
-
def summarize(sentences, preprocessed_sentences, num_sentences):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
vectorizer = TfidfVectorizer()
|
22 |
tfidf_matrix = vectorizer.fit_transform(preprocessed_sentences)
|
23 |
similarity_matrix = cosine_similarity(tfidf_matrix)
|
24 |
nx_graph = nx.from_numpy_array(similarity_matrix)
|
25 |
scores = nx.pagerank(nx_graph)
|
26 |
-
|
27 |
-
|
|
|
|
|
28 |
|
29 |
|
30 |
class CombinedSummarizer:
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
)
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
|
42 |
class BERTSummarizer:
|
|
|
|
|
43 |
def __init__(self):
|
|
|
|
|
|
|
|
|
44 |
self.model = Summarizer()
|
45 |
|
46 |
-
|
47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# extractive.py

"""
Módulo de resúmenes 'extractive.py'

Contiene implementaciones de diferentes técnicas de resumen de texto:
- TF-IDF Summarizer
- TextRank Summarizer
- Combined Summarizer (que combina TF-IDF y TextRank para extraer palabras clave)
- BERT Summarizer (extractivo basado en un modelo BERT preentrenado)
"""
12 |
+
|
13 |
import numpy as np
|
14 |
+
import networkx as nx
|
15 |
+
from typing import List
|
16 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
17 |
from sklearn.metrics.pairwise import cosine_similarity
|
18 |
from summarizer import Summarizer
|
|
|
19 |
|
20 |
|
21 |
class TFIDFSummarizer:
    """Extractive summarizer that ranks sentences by their TF-IDF weight."""

    @staticmethod
    def summarize(sentences: List[str], preprocessed_sentences: List[str], num_sentences: int = 1) -> str:
        """
        Return the top-scoring sentences under a TF-IDF ranking.

        :param sentences: Original, unprocessed sentences.
        :param preprocessed_sentences: Preprocessed (e.g. tokenized/normalized) sentences,
            aligned one-to-one with ``sentences``.
        :param num_sentences: Number of sentences the summary should contain.
        :return: The selected original sentences joined into one string.
        """
        tfidf = TfidfVectorizer().fit_transform(preprocessed_sentences)
        # Score each sentence as the sum of its term weights.
        scores = np.sum(tfidf.toarray(), axis=1)
        # Indices of sentences, best score first.
        top_indices = np.argsort(scores)[::-1][:num_sentences]
        return ' '.join(sentences[i] for i in top_indices)
|
40 |
|
41 |
|
42 |
class TextRankSummarizer:
    """Extractive summarizer based on the TextRank (PageRank-over-sentences) algorithm."""

    @staticmethod
    def summarize(sentences: List[str], preprocessed_sentences: List[str], num_sentences: int = 1) -> str:
        """
        Summarize by running PageRank over a sentence-similarity graph.

        :param sentences: Original sentences.
        :param preprocessed_sentences: Preprocessed sentences, aligned with ``sentences``.
        :param num_sentences: Number of sentences the summary should contain.
        :return: The selected original sentences joined into one string.
        """
        tfidf = TfidfVectorizer().fit_transform(preprocessed_sentences)
        # Nodes are sentences; edge weights are pairwise cosine similarities.
        graph = nx.from_numpy_array(cosine_similarity(tfidf))
        scores = nx.pagerank(graph)
        # Highest PageRank score first; ties break on the larger node index.
        ordered = sorted(((scores[node], node) for node in graph.nodes), reverse=True)
        chosen = [sentences[idx] for _, idx in ordered[:num_sentences]]
        return ' '.join(chosen)
|
64 |
|
65 |
|
66 |
class CombinedSummarizer:
    """Summarizer that scores sentences by keywords agreed on by TF-IDF and TextRank."""

    def __init__(self, top_n_keywords: int = 10):
        """
        :param top_n_keywords: How many keywords to take from each method
            (TF-IDF and TextRank) before intersecting them.
        """
        self.top_n_keywords = top_n_keywords

    def extract_keywords_tfidf(self, preprocessed_sentences: List[str]) -> List[str]:
        """
        Extract the highest-weighted terms according to TF-IDF.

        :param preprocessed_sentences: Preprocessed sentences.
        :return: Up to ``top_n_keywords`` terms, best first.
        """
        vectorizer = TfidfVectorizer()
        matrix = vectorizer.fit_transform(preprocessed_sentences)
        # Total weight of each term across all sentences.
        term_weights = zip(vectorizer.get_feature_names_out(), matrix.toarray().sum(axis=0))
        ranked = sorted(term_weights, key=lambda pair: pair[1], reverse=True)
        return [term for term, _ in ranked[:self.top_n_keywords]]

    def extract_keywords_textrank(self, preprocessed_sentences: List[str]) -> List[str]:
        """
        Extract keywords via PageRank on a word co-occurrence graph
        (adjacent words are connected; repeated adjacency raises the edge weight).

        :param preprocessed_sentences: Preprocessed sentences.
        :return: Up to ``top_n_keywords`` words, best first.
        """
        words = ' '.join(preprocessed_sentences).split()
        graph = nx.Graph()
        for left, right in zip(words, words[1:]):
            if graph.has_edge(left, right):
                graph[left][right]['weight'] += 1
            else:
                graph.add_edge(left, right, weight=1)

        ranks = nx.pagerank(graph, weight='weight')
        ranked = sorted(ranks.items(), key=lambda pair: pair[1], reverse=True)
        return [word for word, _ in ranked[:self.top_n_keywords]]

    def combined_keywords(self, preprocessed_sentences: List[str]) -> List[str]:
        """
        Intersect the TF-IDF and TextRank keyword lists.

        :param preprocessed_sentences: Preprocessed sentences.
        :return: Keywords present in both lists (unordered).
        """
        from_tfidf = self.extract_keywords_tfidf(preprocessed_sentences)
        from_textrank = self.extract_keywords_textrank(preprocessed_sentences)
        return list(set(from_tfidf) & set(from_textrank))

    def summarize(self, sentences: List[str], preprocessed_sentences: List[str], num_sentences: int = 1) -> str:
        """
        Pick the sentences containing the most combined (TF-IDF & TextRank) keywords.

        :param sentences: Original sentences.
        :param preprocessed_sentences: Preprocessed sentences, aligned with ``sentences``.
        :param num_sentences: Number of sentences the summary should contain.
        :return: The selected original sentences joined into one string.
        """
        keywords = self.combined_keywords(preprocessed_sentences)
        scored = [
            (sum(word in keywords for word in sent.split()), idx)
            for idx, sent in enumerate(preprocessed_sentences)
        ]
        # Stable sort: equal scores keep their original sentence order.
        scored.sort(key=lambda pair: pair[0], reverse=True)
        return ' '.join(sentences[idx] for _, idx in scored[:num_sentences])
|
138 |
|
139 |
|
140 |
class BERTSummarizer:
    """Extractive summarizer backed by a pre-trained BERT model."""

    def __init__(self):
        """
        Load the extractive BERT model once, via the Summarizer class from
        the 'bert-extractive-summarizer' package. Loading is expensive, so
        the model is kept on the instance and reused across calls.
        """
        self.model = Summarizer()

    def summarize(self, sentences: List[str], preprocessed_sentences: List[str], num_sentences: int = 1) -> str:
        """
        Generate an extractive summary with the pre-trained BERT model.

        Fix vs. the previous version: ``summarize`` was a ``@staticmethod``
        that built a brand-new ``Summarizer()`` (i.e. reloaded the BERT
        model) on every call, ignoring the model created in ``__init__``.
        It is now an instance method that reuses ``self.model``. Callers
        that invoke it on an instance (the natural usage, since the class
        must be constructed to load the model) are unaffected.

        :param sentences: Original, unprocessed sentences.
        :param preprocessed_sentences: Preprocessed sentences.
        :param num_sentences: Number of sentences the summary should contain.
        :return: A string with the most relevant original sentences.
        """
        # Join the preprocessed sentences into one text for the model.
        text = ' '.join(preprocessed_sentences)

        # Reuse the model loaded in __init__ instead of instantiating a new one.
        summarized_text = self.model(text, num_sentences=num_sentences)

        # Map each summarized fragment back to an original sentence.
        # NOTE(review): this assumes preprocessing preserves the fragment as a
        # substring of some original sentence — confirm against the preprocessor.
        selected = []
        for fragment in summarized_text.split('. '):
            fragment = fragment.strip()
            if not fragment:
                continue  # skip empty pieces produced by the split
            for sentence in sentences:
                if fragment in sentence:
                    selected.append(sentence)
                    break
        return ' '.join(selected[:num_sentences])
|
176 |
+
|