palegre committed on
Commit
b19c8bc
·
0 Parent(s):

Add application file beta.

Browse files
.gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ *.pdf filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # %%
3
+ import os
4
+ from time import sleep
5
+
6
+ from haystack.document_stores import ElasticsearchDocumentStore
7
+ from haystack.utils import launch_es
8
+
9
+ launch_es()
10
+ sleep(30)
11
+ # %%
12
+ os.environ["HAYSTACK_TELEMETRY_ENABLED"] = "False"
13
+ document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")
14
+ # %%
15
+ import pandas as pd
16
+
17
+ df_document = pd.read_csv("data/articles.csv")
18
+ df_document.head()
19
+ # %%
20
+ articles = []
21
+ for idx, row in df_document.iterrows():
22
+ article = {
23
+ "id": idx,
24
+ "content": row["article"],
25
+ "meta":{
26
+ "chapter_name": row["chapter_name"],
27
+ "article_page": row["article_page"],
28
+ "article_number": row["article_number"],
29
+ "article_name": row["article_name"],
30
+ },
31
+ }
32
+ articles.append(article)
33
+
34
+ document_store.write_documents(articles, index="document")
35
+ print(f"Loaded {document_store.get_document_count()} documents")
36
+ # %%
37
+ from haystack.nodes import BM25Retriever
38
+
39
+ retriever = BM25Retriever(document_store=document_store)
40
+ # %%
41
+ from haystack.nodes import FARMReader
42
+
43
+ model_ckpt = "mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es"
44
+ reader = FARMReader(
45
+ model_name_or_path=model_ckpt,
46
+ progress_bar=False,
47
+ max_seq_len=384,
48
+ doc_stride=128,
49
+ return_no_answer=False,
50
+ use_gpu=False,
51
+ )
52
+ # %%
53
+ from haystack.pipelines import ExtractiveQAPipeline
54
+
55
+ pipe = ExtractiveQAPipeline(reader, retriever)
56
+ # %%
57
+ from textwrap import fill
58
+
59
+
60
def run_qa_pipeline(question):
    """Execute the extractive QA pipeline for one query string.

    The retriever fetches the 10 best-matching articles (BM25) and the
    reader extracts the 5 best answer spans from them.

    Args:
        question: Free-text question to search for.

    Returns:
        The raw haystack pipeline result dict (includes an "answers" list).
    """
    search_params = {
        "Retriever": {"top_k": 10},
        "Reader": {"top_k": 5},
    }
    return pipe.run(query=question, params=search_params)
69
+
70
def results_as_markdown(results):
    """Render pipeline answers as a single Markdown string.

    For each extracted answer, the full source article is fetched from the
    document store by its document id; a bold header with the article
    metadata (chapter, number, name, page) is emitted, followed by the
    article text wrapped at 80 columns. Answers are separated by blank
    lines.

    Args:
        results: Pipeline output dict with an "answers" list of haystack
            Answer objects (each with `document_id` and `meta`).

    Returns:
        Markdown text with one formatted section per answer.
    """
    top_answers = []
    # Fix: the original iterated with enumerate() but never used the
    # counter; iterate directly over the answers.
    for result in results["answers"]:
        article = document_store.get_document_by_id(result.document_id)
        meta = result.meta
        formatted_answer = """**Capítulo: {}.\t número: {}.\t nombre: {}.\t página: {}.**
{}
""".format(
            meta["chapter_name"],
            meta["article_number"],
            meta["article_name"],
            meta["article_page"],
            fill(article.content, 80),
        )
        top_answers.append(formatted_answer)

    return "\n\n".join(top_answers)
87
+
88
def query_qa_pipeline(question):
    """Answer *question* end-to-end: run the QA pipeline, render Markdown."""
    return results_as_markdown(run_qa_pipeline(question))
91
+
92
+ # %%
93
+ import gradio as gr
94
+
95
+ title = "**CONSOLIDADO NORMAS APROBADAS PARA LA PROPUESTA CONSTITUCIONAL POR EL PLENO DE LA CONVENCIÓN**"
96
+ default_question = "educación gratuita"
97
+
98
+ with gr.Blocks() as demo:
99
+ gr.Markdown(title)
100
+ with gr.Column():
101
+ with gr.Row():
102
+ question = gr.Textbox(lines=2, max_lines=3, label="Pregunta:", placeholder=default_question)
103
+ with gr.Row():
104
+ btn = gr.Button("Buscar")
105
+ with gr.Row():
106
+ answers = gr.Markdown()
107
+ btn.click(
108
+ fn=query_qa_pipeline,
109
+ inputs=question,
110
+ outputs=answers,
111
+ )
112
+
113
+ demo.launch(share=True)
114
+
115
+ # %%
data/PROPUESTA-DE-BORRADOR-CONSTITUCIONAL-14.05.22-1.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be2629a7708b19a9eeadb6d416e7c761cfeb483531a992706d2c732894468b18
3
+ size 1469444
data/articles.csv ADDED
The diff for this file is too large to render. See raw diff
 
pdf_to_text.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # %%
2
+ import re
3
+
4
+ import fitz
5
+ import pandas as pd
6
+
7
+ # %%
8
+ document_path = "data/PROPUESTA-DE-BORRADOR-CONSTITUCIONAL-14.05.22-1.pdf"
9
+
10
+ # %%
11
+ skip_header_offset = 1
12
+ regex_article = re.compile(r"(\d+\.- ?Artículo.+?(?:\.|-))")
13
+ regex_chapters = re.compile(r"(?<=\n)(CAPÍTULO \(COM \d+\) \n.+?)(?= \n)")
14
+ # %%
15
+ document = ""
16
+ page_article = {}
17
+ pdf_page_offset = 1
18
+ with fitz.open(document_path) as doc:
19
+ for page_idx, page in enumerate(doc, pdf_page_offset):
20
+ text = page.get_text()
21
+ document += text
22
+ articles = regex_article.findall(text)
23
+ for article in articles:
24
+ page_article[article] = page_idx
25
+
26
+ len(page_article)
27
+ # %%
28
+ chapters = {}
29
+ chapter_name = "header"
30
+ splited_chapters = regex_chapters.split(document)
31
+ for chapter in splited_chapters[skip_header_offset:]:
32
+ if chapter.startswith("CAPÍTULO"):
33
+ chapter_name = chapter.replace(" \n", ": ")
34
+ else:
35
+ chapters[chapter_name] = chapter
36
+ len(chapters), chapters.keys()
37
+ # %%
38
+ minimum_article_length = 65
39
def format_article(article, minimum_article_length=65):
    """Clean one raw article string extracted from the PDF.

    The raw text is split into paragraphs on the ``"\\n \\n"`` separator;
    each paragraph is flattened (newlines and asterisks removed, surrounding
    whitespace stripped). Paragraphs of *minimum_article_length* characters
    or fewer are treated as layout noise and dropped, unless they start with
    "El Estado" (short single-sentence articles in this document begin that
    way). Surviving paragraphs are joined with single spaces.

    Args:
        article: Raw article text, possibly prefixed with "- " from the
            numbered-list markup.
        minimum_article_length: Length threshold below which (inclusive) a
            paragraph is discarded. Defaults to 65, the value previously
            hard-coded in a module-level global.

    Returns:
        A single cleaned string.
    """
    paragraphs = article.lstrip("- ").split("\n \n")
    formatted_paragraphs = []
    for paragraph in paragraphs:
        formatted = paragraph.replace("\n", "").replace("*", "").strip()
        is_single_article = formatted.startswith("El Estado")
        is_too_short = len(formatted) <= minimum_article_length

        if is_too_short and not is_single_article:
            continue

        formatted_paragraphs.append(formatted)

    return " ".join(formatted_paragraphs)
54
+
55
+ # %%
56
+ chapter_articles = []
57
+ for chapter_name, chapter in chapters.items():
58
+ article_name = "header"
59
+ splited_articles = regex_article.split(chapter)
60
+ for article in splited_articles[skip_header_offset:]:
61
+ if regex_article.match(article):
62
+ article_name = article
63
+ continue
64
+
65
+ data = {
66
+ "chapter_name": chapter_name,
67
+ "article_page": page_article.get(article_name),
68
+ "article_name": article_name,
69
+ "article": format_article(article),
70
+ }
71
+ chapter_articles.append(data)
72
+ # %%
73
+ df_document = pd.DataFrame.from_dict(chapter_articles)
74
+
75
+ df_document["article_number"] = (
76
+ df_document['article_name']
77
+ .str.extract(r'(^\d+)', expand=False)
78
+ )
79
+ df_document["article_name"] = (
80
+ df_document['article_name']
81
+ .str.extract(r'^\d+\.- ?(.*)', expand=False)
82
+ .str.rstrip(".-")
83
+ )
84
+ df_document.head()
85
+ # %%
86
+ df_document.to_csv("data/articles.csv", index=False)
87
+ # %%
qa_pipeline_faiss.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # %%
2
+ from haystack.document_stores import FAISSDocumentStore
3
+
4
+
5
+ document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")
6
+ # %%
7
+ import pandas as pd
8
+
9
+ df_document = pd.read_csv("data/articles.csv")
10
+
11
+ articles = []
12
+ for idx, row in df_document.iterrows():
13
+ article = {
14
+ "content": row["article"],
15
+ "meta":{
16
+ "chapter_name": row["chapter_name"],
17
+ "article_page": row["article_page"],
18
+ "article_number": row["article_number"],
19
+ "article_name": row["article_name"],
20
+ },
21
+ }
22
+ articles.append(article)
23
+
24
+ document_store.write_documents(articles, index="document")
25
+ print(f"Loaded {document_store.get_document_count()} documents")
26
+ # %%
27
+ from haystack.nodes import DensePassageRetriever
28
+
29
+ retriever = DensePassageRetriever(
30
+ document_store=document_store,
31
+ query_embedding_model="sadakmed/dpr-passage_encoder-spanish",
32
+ passage_embedding_model="sadakmed/dpr-passage_encoder-spanish",
33
+ max_seq_len_query=64,
34
+ max_seq_len_passage=384,
35
+ batch_size=16,
36
+ use_gpu=False,
37
+ embed_title=True,
38
+ use_fast_tokenizers=True,
39
+ )
40
+ document_store.update_embeddings(retriever)
41
+ # %%
42
+ from haystack.nodes import FARMReader
43
+
44
+ model_ckpt = "mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es"
45
+ reader = FARMReader(
46
+ model_name_or_path=model_ckpt,
47
+ progress_bar=False,
48
+ max_seq_len=384,
49
+ doc_stride=128,
50
+ return_no_answer=True,
51
+ use_gpu=False,
52
+ )
53
+ # %%
54
+ from haystack.pipelines import ExtractiveQAPipeline
55
+
56
+ pipe = ExtractiveQAPipeline(reader, retriever)
57
+ # %%
58
+ question = "pueblos originarios justicia"
59
+ prediction = pipe.run(
60
+ query=question,
61
+ params={
62
+ "Retriever": {"top_k": 10},
63
+ "Reader": {"top_k": 5}
64
+ }
65
+ )
66
+ # %%
67
+ from pprint import pprint
68
+
69
+ pprint(prediction)
70
+
71
+ # %%
72
+ from haystack.utils import print_answers
73
+
74
+
75
+ print_answers(prediction, details="minimum")
76
+ # %%
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ PyMuPDF
2
+ farm-haystack
3
+ pandas
4
+ gradio