ArturG9 committed on
Commit
f9074e2
·
verified ·
1 Parent(s): a9c6eb7

Update functions.py

Browse files
Files changed (1) hide show
  1. functions.py +113 -2
functions.py CHANGED
@@ -88,8 +88,119 @@ def create_retriever_from_chroma(vectorstore_path="./docs/chroma/", search_type=
88
 
89
  else:
90
  st.write("Vector store doesnt exist and will be created now")
91
- loader = DirectoryLoader('./data/', glob="./*.txt", loader_cls=TextLoader)
92
- docs = loader.load()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
 
95
  text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
 
88
 
89
  else:
90
  st.write("Vector store doesnt exist and will be created now")
91
# Web pages used as the source corpus; they are passed to
# extract_sentences_from_web() when the vector store has to be built.
# Each URL appears exactly once: a repeated entry would be downloaded and
# indexed twice, putting duplicate documents into the store.
urls = [
    # Python practices and tooling
    "https://github.com/zedr/clean-code-python",
    "https://tenthousandmeters.com/blog/python-behind-the-scenes-10-how-python-dictionaries-work/",
    "https://realpython.com/python-testing/",
    "https://docs.python-guide.org/writing/license/",
    "https://blogs.nvidia.com/blog/what-is-a-transformer-model/",
    "https://research.google/blog/transformer-a-novel-neural-network-architecture-for-language-understanding/",
    "https://realpython.com/python-pep8/",
    "https://towardsdatascience.com/ideal-python-environment-setup-for-data-science-cdb03a447de8",
    "https://realpython.com/python3-object-oriented-programming/",
    "https://realpython.com/python-functional-programming/",
    # Statistics, experimentation and classical machine learning
    "https://fivethirtyeight.com/features/science-isnt-broken/",
    "https://github.com/renatofillinich/ab_test_guide_in_python/blob/master/AB%20testing%20with%20Python.ipynb",
    "https://towardsdatascience.com/why-is-data-science-failing-to-solve-the-right-problems-7b5b6121e3b4",
    "https://medium.com/@srowen/common-probability-distributions-347e6b945ce4",
    "https://scikit-learn.org/stable/modules/compose.html",
    "https://machinelearningmastery.com/light-gradient-boosted-machine-lightgbm-ensemble/",
    "https://neptune.ai/blog/xgboost-vs-lightgbm",
    "https://towardsdatascience.com/interpretable-machine-learning-with-xgboost-9ec80d148d27",
    "https://www.cio.com/article/247005/what-are-containers-and-why-do-you-need-them.html",
    "https://mitsloan.mit.edu/ideas-made-to-matter/machine-learning-explained",
    "https://towardsdatascience.com/making-friends-with-machine-learning-5e28d5205a29",
    "https://towardsdatascience.com/handling-imbalanced-datasets-in-machine-learning-7a0e84220f28",
    "https://machinelearningmastery.com/multi-class-imbalanced-classification/",
    "https://imbalanced-learn.org/stable/auto_examples/applications/plot_impact_imbalanced_classes.html",
    "https://docs.ray.io/en/master/tune/examples/tune-sklearn.html",
    "https://www.kaggle.com/code/ldfreeman3/a-data-science-framework-to-achieve-99-accuracy",
    # Deep learning
    "https://cs231n.github.io/optimization-2/",
    "https://alexander-schiendorfer.github.io/2020/02/24/a-worked-example-of-backprop.html",
    "https://www.analyticsvidhya.com/blog/2020/01/fundamentals-deep-learning-activation-functions-when-to-use-them/",
    "https://ml-cheatsheet.readthedocs.io/en/latest/activation_functions.html",
    "https://d2l.ai/chapter_multilayer-perceptrons/mlp.html",
    "https://d2l.ai/chapter_linear-classification/softmax-regression.html#loss-function",
    "https://d2l.ai/chapter_optimization/",
    "https://www.investopedia.com/terms/s/statistical-significance.asp",
    "https://d2l.ai/chapter_convolutional-neural-networks/why-conv.html",
    "https://d2l.ai/chapter_convolutional-modern/alexnet.html",
    "https://d2l.ai/chapter_convolutional-modern/vgg.html",
    "https://d2l.ai/chapter_convolutional-modern/nin.html",
    "https://d2l.ai/chapter_convolutional-modern/googlenet.html",
    # LangChain / LangGraph documentation
    "https://python.langchain.com/v0.1/docs/guides/productionization/evaluation/",
    "https://python.langchain.com/v0.1/docs/guides/productionization/evaluation/string/",
    "https://python.langchain.com/v0.1/docs/guides/productionization/evaluation/comparison/",
    "https://python.langchain.com/v0.1/docs/guides/productionization/evaluation/trajectory/",
    "https://langchain-ai.github.io/langgraph/concepts/high_level/#why-langgraph",
    "https://langchain-ai.github.io/langgraph/concepts/low_level/#only-stream-tokens-from-specific-nodesllms",
    "https://langchain-ai.github.io/langgraph/concepts/agentic_concepts/#reflection",
    "https://langchain-ai.github.io/langgraph/concepts/faq/",
    "https://www.geeksforgeeks.org/python-oops-concepts/",
    "https://www.mckinsey.com/featured-insights/mckinsey-explainers/what-is-fintech",
    # DataScientest articles
    "https://datascientest.com/en/adversarial-attack-definition-and-protection-against-this-threat",
    "https://datascientest.com/en/all-about-dspy",
    "https://datascientest.com/en/arithmetic-and-data-science",
    "https://datascientest.com/en/all-about-machine-learning-metrics",
    "https://datascientest.com/en/all-about-procedural-programming",
    "https://datascientest.com/en/all-about-cryptography",
    "https://datascientest.com/en/all-about-predictive-coding",
    "https://datascientest.com/en/all-about-network-convergence",
    "https://datascientest.com/en/all-about-forensic-analysis",
    "https://datascientest.com/en/all-about-chatgpt-jailbreak",
    "https://datascientest.com/en/all-about-pentest",
    "https://datascientest.com/en/all-about-embedded-systems",
    "https://datascientest.com/en/all-about-network-operating-system",
    "https://datascientest.com/en/all-about-ai-and-cybersecurity",
    "https://datascientest.com/en/all-about-cybernetics",
    "https://datascientest.com/en/all-about-seo",
    "https://datascientest.com/en/all-about-expert-system",
    "https://datascientest.com/en/all-about-telecommunications",
    "https://datascientest.com/en/all-about-smart-cities",
    "https://datascientest.com/en/all-about-artificial-intelligence-and-finance-sector",
    "https://datascientest.com/en/all-about-generated-pre-trained-transformers",
    "https://datascientest.com/en/all-about-iso-27001",
    "https://datascientest.com/en/all-about-smart-sensors",
    "https://datascientest.com/en/all-about-virtual-networks",
    "https://datascientest.com/en/all-about-ethical-ai",
    "https://datascientest.com/en/all-about-saio",
    "https://datascientest.com/en/all-about-recommendation-algorithm",
    # GeeksforGeeks neural-network articles
    "https://www.geeksforgeeks.org/activation-functions-neural-networks/",
    "https://www.geeksforgeeks.org/activation-functions-in-neural-networks-set2/?ref=oin_asr1",
    "https://www.geeksforgeeks.org/choosing-the-right-activation-function-for-your-neural-network/?ref=oin_asr3",
    "https://www.geeksforgeeks.org/difference-between-feed-forward-neural-networks-and-recurrent-neural-networks/?ref=oin_asr2",
    "https://www.geeksforgeeks.org/recurrent-neural-networks-explanation/?ref=oin_asr11",
    "https://www.geeksforgeeks.org/deeppose-human-pose-estimation-via-deep-neural-networks/?ref=oin_asr13",
    "https://www.geeksforgeeks.org/auto-associative-neural-networks/?ref=oin_asr18",
    "https://www.geeksforgeeks.org/what-are-graph-neural-networks/?ref=oin_asr30",
    "https://hdsr.mitpress.mit.edu/pub/la3vitqm/release/2",
    # Data Science Dojo blog
    "https://datasciencedojo.com/blog/a-guide-to-large-language-models/",
    "https://datasciencedojo.com/blog/bootstrap-sampling/",
    "https://datasciencedojo.com/blog/top-statistical-concepts/",
    "https://datasciencedojo.com/blog/probability-for-data-science/",
    "https://datasciencedojo.com/blog/top-statistical-techniques/",
    "https://datasciencedojo.com/blog/statistical-distributions/",
    "https://datasciencedojo.com/blog/data-science-in-finance/",
    "https://datasciencedojo.com/blog/random-forest-algorithm/",
    "https://datasciencedojo.com/blog/gini-index-and-entropy/",
    "https://datasciencedojo.com/blog/boosting-algorithms-in-machine-learning/",
    "https://datasciencedojo.com/blog/ensemble-methods-in-machine-learning/",
    "https://datasciencedojo.com/blog/langgraph-tutorial/",
    "https://datasciencedojo.com/blog/data-driven-marketing-in-2024/",
    "https://datasciencedojo.com/blog/on-device-ai/",
]
195
def extract_sentences_from_web(links, chunk_size=500, chunk_overlap=30):
    """Load the pages behind ``links`` and return them as one flat list.

    Each URL is handed to its own ``NewsURLLoader`` and the results of
    ``loader.load()`` are concatenated in input order.

    Args:
        links: An iterable of URL strings. A single string is treated as
            one URL rather than being iterated character by character.
            ``None`` is treated as "no links". Empty entries are skipped
            and a URL that occurs more than once is loaded only once.
        chunk_size: Accepted for interface compatibility; not used here.
        chunk_overlap: Accepted for interface compatibility; not used here.

    Returns:
        A list holding everything the loaders returned, in first-seen URL
        order. Empty when there is nothing to load.
    """
    if links is None:
        return []
    if isinstance(links, str):
        # A bare string would otherwise be looped over one character at a time.
        links = [links]

    # dict.fromkeys removes repeats while keeping first-seen order, so the
    # same page is never downloaded (and later indexed) twice.
    unique_links = dict.fromkeys(link for link in links if link)

    data = []
    for link in unique_links:
        # NOTE(review): NewsURLLoader is expected to be imported at file
        # level (not visible in this chunk) -- confirm.
        loader = NewsURLLoader(urls=[link])
        data.extend(loader.load())
    return data
201
+
202
+
203
+ docs = extract_sentences_from_web(links=urls)
204
 
205
 
206
  text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(