Pennywise881 committed
Commit 8644233 · 1 Parent(s): a70737f

Upload 4 files

Files changed (4):
  1. Article.py +55 -0
  2. QuestionAnswer.py +129 -0
  3. VectorDB.py +34 -0
  4. app.py +97 -0
Article.py ADDED
@@ -0,0 +1,55 @@
+ import wikipediaapi
+
+ class Article:
+     def __init__(self):
+         self.article = None
+         self.article_data = []
+         self.id_counter = 0
+
+     def set_summary(self):
+         if self.article.summary:
+             for text in self.article.summary.split('\n'):
+                 self.id_counter += 1
+                 self.article_data.append(
+                     {
+                         'id': self.id_counter,
+                         'section': 'Summary',
+                         'text': text.lower()
+                     }
+                 )
+
+     def set_sections_and_texts(self, sections):
+         for section in sections:
+             if section.text:
+                 for text in section.text.split('\n'):
+                     self.id_counter += 1
+                     self.article_data.append(
+                         {
+                             'id': self.id_counter,
+                             'section': section.title,
+                             'text': text.lower()
+                         }
+                     )
+             if len(section.sections) > 0:
+                 self.set_sections_and_texts(section.sections)
+
+     def clean_data(self):
+         unwanted_sections = ['See also', 'External links']
+         cleaned_data = []
+         for data in self.article_data:
+             if len(data['text']) > 1 and data['section'] not in unwanted_sections:
+                 cleaned_data.append(data)
+
+         self.article_data = cleaned_data
+
+     def get_article_data(self, article_name):
+         self.article = wikipediaapi.Wikipedia('en').page(article_name)
+
+         if not self.article.exists():
+             return []
+         else:
+             self.set_summary()
+             self.set_sections_and_texts(self.article.sections)
+             self.clean_data()
+
+             return self.article_data
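A minimal usage sketch for Article, assuming the wikipedia-api package and network access; the article title is a hypothetical example:

    from Article import Article

    article = Article()
    data = article.get_article_data('Alan Turing')  # hypothetical title; returns [] if the page does not exist
    for row in data[:3]:
        print(row['id'], row['section'], row['text'][:60])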
QuestionAnswer.py ADDED
@@ -0,0 +1,129 @@
+ import torch
+ import numpy as np
+ # # from transformers import AutoTokenizer, AutoModelForQuestionAnswering
+
+
+ class QuestionAnswer:
+
+     def __init__(self, data, model, tokenizer, torch_device):
+
+         self.max_length = 384
+         self.doc_stride = 128
+
+         self.tokenizer = tokenizer
+         self.model = model
+         self.data = data
+         self.torch_device = torch_device
+
+         self.output = None
+         self.features = None
+         self.results = None
+
+     def get_output_from_model(self):
+         # data = {'question': question, 'context': context}
+
+         with torch.no_grad():
+             tokenized_data = self.tokenizer(
+                 self.data['question'],
+                 self.data['context'],
+                 truncation='only_second',
+                 max_length=self.max_length,
+                 stride=self.doc_stride,
+                 return_overflowing_tokens=True,
+                 return_offsets_mapping=True,
+                 padding='max_length',
+                 return_tensors='pt'
+             ).to(self.torch_device)
+
+             output = self.model(tokenized_data['input_ids'], tokenized_data['attention_mask'])
+
+             return output
+
+         # print(output.keys())
+         # print(output['start_logits'].shape)
+         # print(output['end_logits'].shape)
+         # print(tokenized_data.keys())
+
+     def prepare_features(self, example):
+         tokenized_example = self.tokenizer(
+             example['question'],
+             example['context'],
+             truncation='only_second',
+             max_length=self.max_length,
+             stride=self.doc_stride,
+             return_overflowing_tokens=True,
+             return_offsets_mapping=True,
+             padding='max_length',
+         )
+
+         # sample_mapping = tokenized_example.pop("overflow_to_sample_mapping")
+
+         for i in range(len(tokenized_example['input_ids'])):
+             sequence_ids = tokenized_example.sequence_ids(i)
+             # print(sequence_ids)
+             context_index = 1
+
+             # sample_index = sample_mapping[i]
+
+             tokenized_example["offset_mapping"][i] = [
+                 (o if sequence_ids[k] == context_index else None)
+                 for k, o in enumerate(tokenized_example["offset_mapping"][i])
+             ]
+
+         return tokenized_example
+
+     def postprocess_qa_predictions(self, data, features, raw_predictions, top_n_answers=5, max_answer_length=30):
+         all_start_logits, all_end_logits = raw_predictions.start_logits, raw_predictions.end_logits
+
+         # print(all_start_logits)
+
+         results = []
+         context = data['context']
+
+         # print(len(features['input_ids']))
+         for i in range(len(features['input_ids'])):
+             start_logits = all_start_logits[i].cpu().numpy()
+             end_logits = all_end_logits[i].cpu().numpy()
+
+             # print(start_logits)
+
+             offset_mapping = features['offset_mapping'][i]
+
+             start_indices = np.argsort(start_logits)[-1: -top_n_answers - 1: -1].tolist()
+             end_indices = np.argsort(end_logits)[-1: -top_n_answers - 1: -1].tolist()
+
+             for start_index in start_indices:
+                 for end_index in end_indices:
+                     if (
+                         start_index >= len(offset_mapping)
+                         or end_index >= len(offset_mapping)
+                         or offset_mapping[start_index] is None
+                         or offset_mapping[end_index] is None
+                         or end_index < start_index
+                         or end_index - start_index + 1 > max_answer_length
+                     ):
+                         continue
+
+                     start_char = offset_mapping[start_index][0]
+                     end_char = offset_mapping[end_index][1]
+
+                     # print(start_logits[start_index])
+                     # print(end_logits[end_index])
+                     score = start_logits[start_index] + end_logits[end_index]
+                     results.append(
+                         {
+                             'score': float('%.*g' % (3, score)),
+                             'text': context[start_char: end_char]
+                         }
+                     )
+
+         results = sorted(results, key=lambda x: x["score"], reverse=True)[:top_n_answers]
+         return results
+
+     def get_results(self):
+         self.output = self.get_output_from_model()
+         self.features = self.prepare_features(self.data)
+         self.results = self.postprocess_qa_predictions(self.data, self.features, self.output)
+
+         return self.results
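A minimal usage sketch for QuestionAnswer; the model and tokenizer names are the ones loaded in app.py below, while the question and context are made-up examples:

    from transformers import AutoTokenizer, AutoModelForQuestionAnswering
    from QuestionAnswer import QuestionAnswer

    model = AutoModelForQuestionAnswering.from_pretrained('Pennywise881/distilbert-base-uncased-finetuned-squad-v2')
    tokenizer = AutoTokenizer.from_pretrained('Pennywise881/distilbert-base-uncased-finetuned-squad-v2')

    data = {
        'question': 'who founded the company?',                       # made-up example question
        'context': 'the company was founded by jane doe in 1999.'     # made-up example context
    }
    qa = QuestionAnswer(data, model, tokenizer, 'cpu')
    print(qa.get_results())  # list of {'score', 'text'} dicts, best first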
VectorDB.py ADDED
@@ -0,0 +1,34 @@
+ import pinecone
+
+ class VectorDB:
+
+     def __init__(self, retreiver, API_KEY):
+         pinecone.init(api_key=API_KEY, environment='us-east1-gcp')
+         self.retreiver = retreiver
+
+         if 'wikiqav2-index' not in pinecone.list_indexes():
+             pinecone.create_index(
+                 name='wikiqav2-index', dimension=self.retreiver.get_sentence_embedding_dimension(), metric='cosine'
+             )
+
+         self.index = pinecone.Index('wikiqav2-index')
+
+     def upsert_data(self, article_data):
+         for i in range(len(article_data)):
+             article_data[i]['encoding'] = self.retreiver.encode(article_data[i]['text']).tolist()
+
+         upserts = [(str(v['id']), v['encoding'], {'text': v['text'], 'section': v['section']}) for v in article_data]
+
+         # index.upsert(vectors=upserts[0])
+
+         for i in range(0, len(upserts), 10):
+             i_end = i + 10
+             if i_end > len(upserts):
+                 i_end = len(upserts)
+             self.index.upsert(vectors=upserts[i:i_end])
+
+     def get_contexts(self, question):
+         xq = self.retreiver.encode([question]).tolist()
+         contexts = self.index.query(xq, top_k=1, include_metadata=True)
+         return contexts
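A minimal usage sketch for VectorDB; the retriever is built the same way as in app.py below, the Pinecone key is read from the API_KEY environment variable, and the article title and question are hypothetical:

    import os
    from sentence_transformers import models, SentenceTransformer
    from Article import Article
    from VectorDB import VectorDB

    distilbert = models.Transformer('Pennywise881/distilbert-base-uncased-mnr-squadv2')
    pooler = models.Pooling(distilbert.get_word_embedding_dimension(), pooling_mode_mean_tokens=True)
    retreiver = SentenceTransformer(modules=[distilbert, pooler])

    article_data = Article().get_article_data('Alan Turing')           # hypothetical article title
    db = VectorDB(retreiver=retreiver, API_KEY=os.environ['API_KEY'])  # Pinecone key from the environment
    db.upsert_data(article_data=article_data)
    print(db.get_contexts('where was alan turing born?'))              # top-1 paragraph match with metadata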
app.py ADDED
@@ -0,0 +1,97 @@
+ import streamlit as st
+ import os
+
+ from Article import Article
+ from VectorDB import VectorDB
+ from QuestionAnswer import QuestionAnswer
+
+ from transformers import AutoTokenizer, AutoModelForQuestionAnswering
+ from sentence_transformers import models, SentenceTransformer
+
+
+ reader = AutoModelForQuestionAnswering.from_pretrained('Pennywise881/distilbert-base-uncased-finetuned-squad-v2')
+ tokenizer = AutoTokenizer.from_pretrained('Pennywise881/distilbert-base-uncased-finetuned-squad-v2')
+
+ distilbert = models.Transformer("Pennywise881/distilbert-base-uncased-mnr-squadv2")
+ pooler = models.Pooling(
+     distilbert.get_word_embedding_dimension(),
+     pooling_mode_mean_tokens=True
+ )
+
+ retreiver = SentenceTransformer(modules=[distilbert, pooler])
+
+ if 'found_article' not in st.session_state:
+     st.session_state.found_article = False
+     st.session_state.article_name = ''
+     st.session_state.db = None
+     st.session_state.qas = []
+
+ st.write("""
+ # Wiki Q&A V2
+ """)
+ placeholder = st.empty()
+
+ def get_article(retreiver):
+     article_name = placeholder.text_input("Enter the name of a Wikipedia article")
+
+     if article_name:
+         article = Article()
+         article_data = article.get_article_data(article_name=article_name)
+
+         if len(article_data) > 0:
+             API_KEY = os.environ['API_KEY']
+             db = VectorDB(retreiver=retreiver, API_KEY=API_KEY)
+             db.upsert_data(article_data=article_data)
+             ask_questions(article_name=article_name, db=db)
+
+             st.session_state.found_article = True
+             st.session_state.article_name = article_name
+             st.session_state.db = db
+         else:
+             st.write(f'Sorry, could not find Wikipedia article: {article_name}')
+
+ def ask_questions(article_name, db: VectorDB):
+     question = placeholder.text_input(f"Ask questions about '{article_name}'", '')
+     st.header("Questions and Answers:")
+
+     if question:
+         contexts = db.get_contexts(question.lower())
+         # print(contexts)
+
+         data = {
+             'question': question.lower(),
+             'context': contexts['matches'][0]['metadata']['text']
+         }
+         qa = QuestionAnswer(data, reader, tokenizer, 'cpu')
+         results = qa.get_results()
+
+         paragraph_index = contexts['matches'][0]['id']
+         section = contexts['matches'][0]['metadata']['section']
+         answer = ''
+         for r in results:
+             answer += r['text'] + ", "
+
+         answer = answer[:len(answer) - 2]
+         st.session_state.qas.append(
+             {
+                 'question': question,
+                 'answer': answer,
+                 'section': section,
+                 'para': paragraph_index
+             }
+         )
+
+     if len(st.session_state.qas) > 0:
+         for data in st.session_state.qas:
+             st.text(
+                 "Question: " + data['question'] + '\n' +
+                 "Answer: " + data['answer'] + '\n' +
+                 "Section: " + data['section'] + '\n' +
+                 "Paragraph #: " + data['para']
+             )
+
+ if not st.session_state.found_article:
+     get_article(retreiver)
+
+ else:
+     ask_questions(st.session_state.article_name, st.session_state.db)
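To try the app locally, the dependencies imported above need to be installed and the API_KEY environment variable set to a valid Pinecone key; the app is then launched with the standard Streamlit command, "streamlit run app.py".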