Adrien
committed on
Commit
·
bccb279
1
Parent(s):
3823e3e
clean up
Browse files
rag_demo/rag/source_annotator.py
CHANGED
@@ -11,6 +11,7 @@ from transformers import pipeline
|
|
11 |
|
12 |
class SourceAnnotator:
|
13 |
def __init__(self):
|
|
|
14 |
self.source_annotator = pipeline(
|
15 |
"question-answering",
|
16 |
model="distilbert/distilbert-base-cased-distilled-squad",
|
@@ -22,7 +23,7 @@ class SourceAnnotator:
|
|
22 |
for sentence in sentences:
|
23 |
scores = []
|
24 |
for chunk in reranked_chunks:
|
25 |
-
score = self.
|
26 |
score["filename"] = chunk.metadata["filename"].split(".pdf")[0]
|
27 |
score["chunk_id"] = chunk.chunk_id
|
28 |
scores.append(score)
|
@@ -38,6 +39,3 @@ class SourceAnnotator:
|
|
38 |
pattern = r"(?<=[.!?])\s+(?=[A-Z])"
|
39 |
sentences = re.split(pattern, text)
|
40 |
return [s.strip() for s in sentences if s.strip()]
|
41 |
-
|
42 |
-
def annotate_source(self, text: str, chunk: str) -> dict:
|
43 |
-
return self.source_annotator(text, chunk)
|
|
|
11 |
|
12 |
class SourceAnnotator:
|
13 |
def __init__(self):
|
14 |
+
# Extractive question answering model
|
15 |
self.source_annotator = pipeline(
|
16 |
"question-answering",
|
17 |
model="distilbert/distilbert-base-cased-distilled-squad",
|
|
|
23 |
for sentence in sentences:
|
24 |
scores = []
|
25 |
for chunk in reranked_chunks:
|
26 |
+
score = self.source_annotator(sentence.lower(), chunk.content.lower())
|
27 |
score["filename"] = chunk.metadata["filename"].split(".pdf")[0]
|
28 |
score["chunk_id"] = chunk.chunk_id
|
29 |
scores.append(score)
|
|
|
39 |
pattern = r"(?<=[.!?])\s+(?=[A-Z])"
|
40 |
sentences = re.split(pattern, text)
|
41 |
return [s.strip() for s in sentences if s.strip()]
|
|
|
|
|
|