Spaces:
Sleeping
Sleeping
Update functions.py
Browse files- functions.py +113 -2
functions.py
CHANGED
@@ -88,8 +88,119 @@ def create_retriever_from_chroma(vectorstore_path="./docs/chroma/", search_type=
|
|
88 |
|
89 |
else:
|
90 |
st.write("Vector store doesnt exist and will be created now")
|
91 |
-
|
92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
|
94 |
|
95 |
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
|
|
|
88 |
|
89 |
else:
|
90 |
st.write("Vector store doesnt exist and will be created now")
|
91 |
+
urls = [
|
92 |
+
|
93 |
+
"https://github.com/zedr/clean-code-python",
|
94 |
+
"https://tenthousandmeters.com/blog/python-behind-the-scenes-10-how-python-dictionaries-work/",
|
95 |
+
"https://realpython.com/python-testing/",
|
96 |
+
"https://docs.python-guide.org/writing/license/",
|
97 |
+
"https://blogs.nvidia.com/blog/what-is-a-transformer-model/",
|
98 |
+
"https://research.google/blog/transformer-a-novel-neural-network-architecture-for-language-understanding/",
|
99 |
+
"https://realpython.com/python-pep8/",
|
100 |
+
"https://towardsdatascience.com/ideal-python-environment-setup-for-data-science-cdb03a447de8",
|
101 |
+
"https://realpython.com/python3-object-oriented-programming/",
|
102 |
+
"https://realpython.com/python-functional-programming/",
|
103 |
+
"https://fivethirtyeight.com/features/science-isnt-broken/",
|
104 |
+
"https://github.com/renatofillinich/ab_test_guide_in_python/blob/master/AB%20testing%20with%20Python.ipynb",
|
105 |
+
"https://towardsdatascience.com/why-is-data-science-failing-to-solve-the-right-problems-7b5b6121e3b4",
|
106 |
+
"https://medium.com/@srowen/common-probability-distributions-347e6b945ce4",
|
107 |
+
"https://github.com/renatofillinich/ab_test_guide_in_python/blob/master/AB%20testing%20with%20Python.ipynb",
|
108 |
+
"https://scikit-learn.org/stable/modules/compose.html",
|
109 |
+
"https://machinelearningmastery.com/light-gradient-boosted-machine-lightgbm-ensemble/",
|
110 |
+
"https://neptune.ai/blog/xgboost-vs-lightgbm",
|
111 |
+
"https://towardsdatascience.com/interpretable-machine-learning-with-xgboost-9ec80d148d27",
|
112 |
+
"https://www.cio.com/article/247005/what-are-containers-and-why-do-you-need-them.html",
|
113 |
+
"https://mitsloan.mit.edu/ideas-made-to-matter/machine-learning-explained",
|
114 |
+
"https://towardsdatascience.com/making-friends-with-machine-learning-5e28d5205a29",
|
115 |
+
"https://towardsdatascience.com/handling-imbalanced-datasets-in-machine-learning-7a0e84220f28",
|
116 |
+
"https://machinelearningmastery.com/multi-class-imbalanced-classification/",
|
117 |
+
"https://imbalanced-learn.org/stable/auto_examples/applications/plot_impact_imbalanced_classes.html",
|
118 |
+
"https://docs.ray.io/en/master/tune/examples/tune-sklearn.html",
|
119 |
+
"https://www.kaggle.com/code/ldfreeman3/a-data-science-framework-to-achieve-99-accuracy",
|
120 |
+
"https://cs231n.github.io/optimization-2/",
|
121 |
+
"https://alexander-schiendorfer.github.io/2020/02/24/a-worked-example-of-backprop.html",
|
122 |
+
"https://www.analyticsvidhya.com/blog/2020/01/fundamentals-deep-learning-activation-functions-when-to-use-them/",
|
123 |
+
"https://ml-cheatsheet.readthedocs.io/en/latest/activation_functions.html",
|
124 |
+
"https://d2l.ai/chapter_multilayer-perceptrons/mlp.html",
|
125 |
+
"https://d2l.ai/chapter_linear-classification/softmax-regression.html#loss-function",
|
126 |
+
"https://d2l.ai/chapter_optimization/",
|
127 |
+
"https://www.investopedia.com/terms/s/statistical-significance.asp",
|
128 |
+
"https://d2l.ai/chapter_linear-classification/softmax-regression.html#loss-function",
|
129 |
+
"https://d2l.ai/chapter_convolutional-neural-networks/why-conv.html",
|
130 |
+
"https://d2l.ai/chapter_convolutional-modern/alexnet.html",
|
131 |
+
"https://d2l.ai/chapter_convolutional-modern/vgg.html",
|
132 |
+
"https://d2l.ai/chapter_convolutional-modern/nin.html",
|
133 |
+
"https://d2l.ai/chapter_convolutional-modern/googlenet.html",
|
134 |
+
'https://python.langchain.com/v0.1/docs/guides/productionization/evaluation/',
|
135 |
+
'https://python.langchain.com/v0.1/docs/guides/productionization/evaluation/string/',
|
136 |
+
'https://python.langchain.com/v0.1/docs/guides/productionization/evaluation/comparison/',
|
137 |
+
'https://python.langchain.com/v0.1/docs/guides/productionization/evaluation/trajectory/',
|
138 |
+
"https://langchain-ai.github.io/langgraph/concepts/high_level/#why-langgraph",
|
139 |
+
'https://langchain-ai.github.io/langgraph/concepts/low_level/#only-stream-tokens-from-specific-nodesllms',
|
140 |
+
"https://langchain-ai.github.io/langgraph/concepts/agentic_concepts/#reflection",
|
141 |
+
"https://langchain-ai.github.io/langgraph/concepts/faq/",
|
142 |
+
"https://www.geeksforgeeks.org/python-oops-concepts/",
|
143 |
+
"https://www.mckinsey.com/featured-insights/mckinsey-explainers/what-is-fintech",
|
144 |
+
"https://datascientest.com/en/adversarial-attack-definition-and-protection-against-this-threat",
|
145 |
+
"https://datascientest.com/en/all-about-dspy",
|
146 |
+
"https://datascientest.com/en/arithmetic-and-data-science",
|
147 |
+
"https://datascientest.com/en/all-about-machine-learning-metrics",
|
148 |
+
"https://datascientest.com/en/all-about-procedural-programming",
|
149 |
+
"https://datascientest.com/en/all-about-cryptography",
|
150 |
+
"https://datascientest.com/en/all-about-predictive-coding",
|
151 |
+
"https://datascientest.com/en/all-about-network-convergence",
|
152 |
+
"https://datascientest.com/en/all-about-forensic-analysis",
|
153 |
+
"https://datascientest.com/en/all-about-chatgpt-jailbreak",
|
154 |
+
"https://datascientest.com/en/all-about-pentest",
|
155 |
+
"https://datascientest.com/en/all-about-embedded-systems",
|
156 |
+
"https://datascientest.com/en/all-about-network-operating-system",
|
157 |
+
"https://datascientest.com/en/all-about-ai-and-cybersecurity",
|
158 |
+
"https://datascientest.com/en/all-about-cybernetics",
|
159 |
+
"https://datascientest.com/en/all-about-seo",
|
160 |
+
"https://datascientest.com/en/all-about-expert-system",
|
161 |
+
"https://datascientest.com/en/all-about-telecommunications",
|
162 |
+
"https://datascientest.com/en/all-about-smart-cities",
|
163 |
+
"https://datascientest.com/en/all-about-artificial-intelligence-and-finance-sector",
|
164 |
+
"https://datascientest.com/en/all-about-generated-pre-trained-transformers",
|
165 |
+
"https://datascientest.com/en/all-about-iso-27001",
|
166 |
+
"https://datascientest.com/en/all-about-smart-sensors",
|
167 |
+
"https://datascientest.com/en/all-about-virtual-networks",
|
168 |
+
"https://datascientest.com/en/all-about-ethical-ai",
|
169 |
+
"https://datascientest.com/en/all-about-saio",
|
170 |
+
"https://datascientest.com/en/all-about-recommendation-algorithm",
|
171 |
+
"https://www.geeksforgeeks.org/activation-functions-neural-networks/",
|
172 |
+
"https://www.geeksforgeeks.org/activation-functions-in-neural-networks-set2/?ref=oin_asr1",
|
173 |
+
"https://www.geeksforgeeks.org/choosing-the-right-activation-function-for-your-neural-network/?ref=oin_asr3",
|
174 |
+
"https://www.geeksforgeeks.org/difference-between-feed-forward-neural-networks-and-recurrent-neural-networks/?ref=oin_asr2",
|
175 |
+
"https://www.geeksforgeeks.org/recurrent-neural-networks-explanation/?ref=oin_asr11",
|
176 |
+
"https://www.geeksforgeeks.org/deeppose-human-pose-estimation-via-deep-neural-networks/?ref=oin_asr13",
|
177 |
+
"https://www.geeksforgeeks.org/auto-associative-neural-networks/?ref=oin_asr18",
|
178 |
+
"https://www.geeksforgeeks.org/what-are-graph-neural-networks/?ref=oin_asr30",
|
179 |
+
"https://hdsr.mitpress.mit.edu/pub/la3vitqm/release/2",
|
180 |
+
"https://datasciencedojo.com/blog/a-guide-to-large-language-models/",
|
181 |
+
"https://datasciencedojo.com/blog/bootstrap-sampling/",
|
182 |
+
"https://datasciencedojo.com/blog/top-statistical-concepts/",
|
183 |
+
"https://datasciencedojo.com/blog/probability-for-data-science/",
|
184 |
+
"https://datasciencedojo.com/blog/top-statistical-techniques/",
|
185 |
+
"https://datasciencedojo.com/blog/statistical-distributions/",
|
186 |
+
"https://datasciencedojo.com/blog/data-science-in-finance/",
|
187 |
+
"https://datasciencedojo.com/blog/random-forest-algorithm/",
|
188 |
+
"https://datasciencedojo.com/blog/gini-index-and-entropy/",
|
189 |
+
"https://datasciencedojo.com/blog/boosting-algorithms-in-machine-learning/",
|
190 |
+
"https://datasciencedojo.com/blog/ensemble-methods-in-machine-learning/",
|
191 |
+
"https://datasciencedojo.com/blog/langgraph-tutorial/",
|
192 |
+
"https://datasciencedojo.com/blog/data-driven-marketing-in-2024/",
|
193 |
+
"https://datasciencedojo.com/blog/on-device-ai/",
|
194 |
+
|
195 |
+
def extract_sentences_from_web(links, chunk_size=500, chunk_overlap=30):
    """Load the web pages behind *links* and return the loaded documents.

    Every distinct URL is handed to its own ``NewsURLLoader`` and the
    results of the individual ``load()`` calls are concatenated in the
    order the URLs first appear in *links*.

    Despite the name, this function does not split anything into
    sentences or chunks; it returns whatever the loader produces.

    Args:
        links: Iterable of URL strings. Repeated URLs are fetched only
            once so the same page does not end up in the result twice.
        chunk_size: Accepted for backward compatibility; not used here.
        chunk_overlap: Accepted for backward compatibility; not used here.

    Returns:
        A list with the items returned by each ``loader.load()`` call.
        An empty *links* yields an empty list.
    """
    # NOTE(review): NewsURLLoader is presumably LangChain's community
    # loader imported at the top of the file -- confirm.
    data = []
    # dict.fromkeys drops repeated URLs while preserving first-seen order.
    for link in dict.fromkeys(links):
        loader = NewsURLLoader(urls=[link])
        data.extend(loader.load())
    return data
|
201 |
+
|
202 |
+
|
203 |
+
docs = extract_sentences_from_web(links=urls)
|
204 |
|
205 |
|
206 |
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
|