deepaksarika01 committed on
Commit
7cf68b3
1 Parent(s): 2415a29

Upload 5 files

Browse files
Files changed (4) hide show
  1. main.py +1 -0
  2. model.py +128 -0
  3. requirements.txt +11 -0
  4. utils.py +41 -0
main.py ADDED
@@ -0,0 +1 @@
 
 
1
+
model.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.llms import HuggingFacePipeline
2
+ from langchain.embeddings import HuggingFaceInstructEmbeddings
3
+ from langchain.chains import RetrievalQA
4
+ from transformers import (
5
+ AutoTokenizer,
6
+ AutoModelForSeq2SeqLM,
7
+ pipeline,
8
+ GenerationConfig
9
+ )
10
+
11
class lamini:
    """Factory for a LangChain-wrapped LaMini-Flan-T5 generation pipeline."""

    def __init__(self):
        pass

    def load_model(self, task="text2text-generation", **kwargs) -> HuggingFacePipeline:
        """Build a transformers pipeline for LaMini and wrap it for LangChain.

        - model: MBZUAI/LaMini-Flan-T5-248M

        Args:
            task (str): task name handed to ``transformers.pipeline``. The
                model is loaded with ``AutoModelForSeq2SeqLM``, so the task
                should be one a seq2seq model can serve.
            **kwargs: optional generation overrides. Recognised keys are
                ``max_length`` (default 512), ``temperature`` (default 0),
                ``top_p`` (default 0.95) and ``repetition_penalty``
                (default 1.15). Any other key is ignored.

        Returns:
            HuggingFacePipeline: LangChain LLM wrapper around the pipeline.
        """
        model_id = "MBZUAI/LaMini-Flan-T5-248M"
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
        gen_config = GenerationConfig.from_pretrained(model_id)

        max_length = kwargs.get("max_length", 512)
        temperature = kwargs.get("temperature", 0)
        top_p = kwargs.get("top_p", 0.95)
        repetition_penalty = kwargs.get("repetition_penalty", 1.15)

        pipe = pipeline(
            # Honour the caller's ``task``; it used to be hard-coded here,
            # which silently discarded the argument.
            task,
            model=model,
            tokenizer=tokenizer,
            generation_config=gen_config,
            max_length=max_length,
            top_p=top_p,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
        )

        return HuggingFacePipeline(pipeline=pipe)
45
+
46
class templates:
    """Canned instruction prompts that are run through a LangChain LLM."""

    def __init__(self, llm: "HuggingFacePipeline"):
        self.llm = llm

    def _run(self, instruction, text, **kwargs):
        """Prefix ``text`` with ``instruction`` and send the prompt to the LLM."""
        return self.llm(instruction + text, **kwargs)

    def summarize(self, text, **kwargs):
        """Summarize text

        Args:
            text (str): text to summarize
            **kwargs: forwarded unchanged to the LLM call

        Returns:
            str: summarized text
        """
        return self._run("summarize for better understanding: ", text, **kwargs)

    def generate_title(self, text, **kwargs):
        """Generate a title for text

        Args:
            text (str): text to generate title for
            **kwargs: forwarded unchanged to the LLM call

        Returns:
            str: title
        """
        return self._run("generate a title for this text: ", text, **kwargs)

    def generate_tile(self, text, **kwargs):
        """Backward-compatible alias for :meth:`generate_title`.

        The method was originally published under this misspelled name, so it
        is kept for existing callers.
        """
        return self.generate_title(text, **kwargs)
77
+
78
class qa_template:
    """Retrieval question answering over a single text knowledge base."""

    def __init__(self, llm):
        self.llm = llm
        # Set by load(). None means no knowledge base has been indexed yet.
        # (A bare annotation here would not create the attribute at all.)
        self.qa_inf = None

    def load(self, knowledge_base):
        """Load knowledge base

        Splits the text into chunks of size 512, embeds them with
        ``hkunlp/instructor-large``, stores them in a Chroma vector store and
        builds a "stuff" RetrievalQA chain on top of its retriever.

        Args:
            knowledge_base (str): knowledge base to load

        Returns:
            BaseRetrievalQA: (optional to use) returns QA interface; the same
            object is also kept on ``self.qa_inf``.
        """
        from utils import LangChainChunker
        from langchain.vectorstores import Chroma
        from langchain.chains import RetrievalQA

        embeds = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-large")
        chunks = LangChainChunker(knowledge_base).chunker(size=512)
        retriever = Chroma.from_texts(chunks, embeds).as_retriever()

        self.qa_inf = RetrievalQA.from_chain_type(
            llm=self.llm, chain_type="stuff", retriever=retriever
        )
        return self.qa_inf

    def start_gradio(self, title: str):
        """Start gradio interface

        Args:
            title (str): label appended to the chat window title.

        Raises:
            RuntimeError: if ``load()`` has not been called yet, so there is
                no QA chain to answer questions with.
        """
        # Fail before the UI starts instead of erroring inside the chat
        # callback on the first user message.
        if self.qa_inf is None:
            raise RuntimeError(
                "No knowledge base loaded: call load() before start_gradio()."
            )

        # Imported lazily so gradio is only needed when the UI is used.
        import gradio as gr

        def interface(msg, history):
            # ``history`` is part of the ChatInterface callback signature;
            # each question is answered independently of it.
            res = self.qa_inf.run(msg)
            return str(res)

        ui = gr.ChatInterface(
            fn=interface,
            examples=["What is the video about?", "key points of the video"],
            title=f"Question Mode - {title}",
        )

        ui.launch()
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch
2
+ transformers
3
+ nltk
4
+ youtube_transcript_api
5
+ accelerate
6
+ langchain
7
+ yt-dlp
8
+ rich
9
+ chromadb
10
+ InstructorEmbedding
11
+ sentence_transformers
utils.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
class LangChainChunker:
    """Split a block of text into chunks with LangChain's character splitter."""

    def __init__(self, text):
        self.text = text

    def chunker(self, size=1000, overlap=0):
        """Split ``self.text`` on spaces into chunks.

        Args:
            size (int): value passed to the splitter as ``chunk_size``.
            overlap (int): value passed to the splitter as ``chunk_overlap``.
                This used to be the hard-coded float ``0.9``, although the
                splitter takes an integer character count.
                NOTE(review): 0 is assumed to match the old effective
                behaviour; confirm no caller relied on the fractional value.

        Returns:
            list[str]: the chunks produced by ``split_text``.
        """
        from langchain.text_splitter import CharacterTextSplitter

        # TODO: attach the duration of the video to the chunk
        # [[chunk, duration]]

        text_splitter = CharacterTextSplitter(
            separator=" ",
            chunk_size=size,
            chunk_overlap=overlap,
        )

        return text_splitter.split_text(self.text)

    def __sizeof__(self) -> int:
        """Return the number of characters in the stored text.

        Note that this overrides the hook used by ``sys.getsizeof``, so the
        value is a character count rather than a memory size in bytes.
        """
        return len(self.text)
24
+
25
+
26
def getSubsText(video_id="", getGenerated=False):
    """Return a YouTube video's transcript as one line of plain text.

    Args:
        video_id (str): id passed to ``YouTubeTranscriptApi.list_transcripts``.
        getGenerated (bool): reserved; currently has no effect (see TODO).

    Returns:
        str: text of the last transcript in the listing, with newlines
        replaced by spaces; ``""`` if the listing yields no transcripts.
    """
    from youtube_transcript_api import YouTubeTranscriptApi as ytapi
    from youtube_transcript_api.formatters import TextFormatter

    transcripts = list(ytapi.list_transcripts(video_id))

    if getGenerated:
        # TODO: implement getGenerated
        pass

    if not transcripts:
        return ""

    # Only the last listed transcript was ever kept, so fetch just that one
    # instead of downloading every transcript and discarding the rest.
    data = transcripts[-1].fetch()

    return TextFormatter().format_transcript(data).replace("\n", " ")
40
+
41
+