import os
import threading
import time

import requests
from flask import Flask, render_template

from langchain_core.tools import Tool
from langchain_google_community import GoogleSearchAPIWrapper
from langchain_community.tools import DuckDuckGoSearchResults



API_URL0 = "https://api-inference.huggingface.co/models/sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
API_URL1 = "https://api-inference.huggingface.co/models/sentence-transformers/all-mpnet-base-v2"
API_URL2 = "https://api-inference.huggingface.co/models/sentence-transformers/all-roberta-large-v1"
API_URL3 = "https://api-inference.huggingface.co/models/Snowflake/snowflake-arctic-embed-l-v2.0"
# API_URL4 = "https://api-inference.huggingface.co/models/sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
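
# Each URL above is a Hugging Face Inference API sentence-similarity endpoint:
# given one source sentence and a list of candidate sentences, each returns a
# JSON list of similarity scores (one float per candidate), which is what the
# isinstance(..., list) checks further down rely on.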



search = GoogleSearchAPIWrapper()

# Build the Authorization header from the TOKEN env var. Avoid printing the
# full header: it contains the bearer token. Default to "" so a missing TOKEN
# does not raise a TypeError on concatenation.
bearer = "Bearer " + os.getenv('TOKEN', '')
headers = {"Authorization": bearer}

app = Flask(__name__)

@app.route('/app')
def server_app():
    llamafile = threading.Thread(target=threadserver)
    print('This /app will start the llamafile server on thread')
    llamafile.start()
    return 'llamafile.start()'

@app.route('/findsimilarity')
def server_one():
    sourcesim = "Results"
    s1 = "Results"
    # NOTE: headertxt passes the raw bearer token into the rendered page
    return render_template("similarity_1.html", sourcetxt=sourcesim, s1=s1, headertxt=bearer)


@app.route('/')
async def server_1():
    # TODO :: check html first then check similarity
    # TODO :: check parts of snipp to pass in the processing func
    query_sentence = "capital city of the Philippines"
    duck_results = []
    all_results = []

    try:
        searchduck = DuckDuckGoSearchResults(output_format="list", num_results=20)
        duck_results = searchduck.invoke(query_sentence)
        print("type of duck")
        print(type(duck_results))
    except Exception as e:
        print("DuckDuckGo search failed:", e)
        duck_results = []

    if isinstance(duck_results, list) and len(duck_results) > 0:
        all_results = duck_results
    
    # NOTE: `tool` is constructed but never invoked below; Google results are
    # fetched directly via search.results().
    tool = Tool(
        name="google_search",
        description="Search Google for recent results.",
        func=search.run,
    )

    google_results = []  # initialize so the check below cannot hit a NameError
    try:
        google_results = search.results(query_sentence, 10)
    except Exception as e:
        print("Google search failed:", e)

    if isinstance(google_results, list) and len(google_results) > 0:
        all_results = all_results + google_results
        print("len of google and duck")
        print(len(all_results))
        print(len(google_results))
        print(len(duck_results))
    print("type of google")
    print(type(google_results))
    # print(all_results)
    all_snipps = []  # NOTE: currently unused
    new_results = []
    # classify the query words: capitalized words are treated as proper nouns,
    # lowercase words longer than 3 characters as important keywords
    split_query_words = query_sentence.split()
    important_keywords = []
    uppercased_keywords = []
    for x in split_query_words:
        if x[0].isupper():
            uppercased_keywords.append(x)
        if len(x) > 3 and not x[0].isupper():
            important_keywords.append(x)
    print("what is important and upper")
    print(important_keywords)
    print(uppercased_keywords)
    snipp_score = 0 
    capitalized_score = 0       
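    # Scoring heuristic: a snippet earns one point per keyword it contains;
    # capitalized (likely proper-noun) hits are counted separately so the
    # tiers below can require at least one proper-noun match whenever the
    # query contains any.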
    for x in all_results:
        snipp_score = 0
        capitalized_score = 0
        for words in important_keywords:
            if words in x["snippet"]:
                snipp_score = snipp_score + 1
        for words in uppercased_keywords:
            if words in x["snippet"]:
                snipp_score = snipp_score + 1
                capitalized_score = capitalized_score + 1

        if (snipp_score >= len(important_keywords)) and ((capitalized_score <= len(uppercased_keywords) and capitalized_score > 0) or (len(uppercased_keywords) == 0)):
            new_results.append(x)
            continue
        if ((snipp_score <= len(important_keywords) and snipp_score >= 2) and (len(important_keywords) <= 4)) and ((capitalized_score <= len(uppercased_keywords) and capitalized_score >= 1) or (len(uppercased_keywords) == 0)):
            new_results.append(x)
            continue
        if ((snipp_score <= len(important_keywords) and snipp_score >= 4) and (len(important_keywords) >= 5 and len(important_keywords) <= 7)) and ((capitalized_score <= len(uppercased_keywords) and capitalized_score > 0) or (len(uppercased_keywords) == 0)):
            new_results.append(x)
            continue
        else:
            # result did not clear any tier; skip it
            print("This is not added")

    # print("these are new_results")
    # print("===============================")     

    # print(new_results)  
    
    # print("these are new_results")
    # print("===============================")     
    
    print( " len( new_results)  ")
    print( len( new_results)  ) 
    print("type of all_results")
    # TODO :: check html first then check similarity
    # TODO :: check parts of snipp to pass in the processing func     
    # TODO :: pull pages and split each html and count occurance of important keywords here & check snipp if snipp occurs between . and <p> its good not img
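
    # Second pass: split each surviving snippet into rough sentences (on '.')
    # and keep only the sentences that themselves clear the keyword thresholds;
    # n_results maps result position -> list of qualifying sentences.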
    
    n_results = {}
    iter_x = 0
    for x in new_results:
        n_results[iter_x] = []
        for y in (x["snippet"]).split('.'):
            score = 0
            cap_score = 0
            for words in important_keywords:
                if words in y:
                    score = score + 1
            for words in uppercased_keywords:
                if words in y:
                    cap_score = cap_score + 1
            if (score == len(important_keywords)) and (cap_score >= len(uppercased_keywords)):
                n_results[iter_x].append(y)
            # elif (rather than a second if) so a sentence matching both tests
            # is not appended twice
            elif (score >= (len(important_keywords) - 1)) or (cap_score >= len(uppercased_keywords) and len(uppercased_keywords) > 0):
                n_results[iter_x].append(y)
        iter_x = iter_x + 1

    print("n_results length")
    print(len(n_results))
    print("nresults")
    sentences_comparison = []
    iter_x = 0
    for y in n_results : 
        print("y")
        print(n_results[iter_x])
        print(y)
        # print(y) 
        for x in n_results[iter_x] : 
            sentences_comparison.append(x)
        iter_x = iter_x + 1
    
    print("sentences_comparison")
    print(sentences_comparison)

    # Planning sketch for the pass above: for each result, for each sentence,
    # count keyword hits (score) and capitalized hits (cap_score); append the
    # sentence to n_results[i] when the counts clear the thresholds.

    # TODO :: check parts of snipp
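
    # The payload below follows the HF sentence-similarity input shape: one
    # "source_sentence" plus a list of candidate "sentences"; each endpoint
    # responds with one similarity score per candidate, in the same order.
    # NOTE: the candidates are currently hardcoded test strings; the
    # sentences_comparison list built above is not yet wired in here.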
    
    payload = {
        "inputs": {
            "source_sentence": "Manila is the capital city of the Philippines",
            "sentences": [
                "The current capital city, Manila, has been the countrys capital throughout most",
                "Manila officially the City of Manila (Filipino: Lungsod ng Maynila),",
                "Dis 4, 2024 — Manila, capital and chief city of the Philippines. The city is the centre ",
                "Quezon City is the capital of the Philippines",
                "Manila is the capital of the philippines",
                "For sometime Manila has been the capital of of the Philippines",
                "What is the capital of Philippines",
                "Manila is not the capital of the Phillipines",
                "Quezon city was the capital of the Philippines, until President Ferdinand ",
            ],
        },
    }
    response0 =  requests.post(API_URL0, headers=headers, json=payload)    
    response1 =  requests.post(API_URL1, headers=headers, json=payload)
    response2 =  requests.post(API_URL2, headers=headers, json=payload)
    response3 =  requests.post(API_URL3, headers=headers, json=payload)
    
    varcontinue_similarity = 0
    resp_list0 = response0.json()
    resp_list1 = response1.json()
    resp_list2 = response2.json()
    resp_list3 = response3.json()
    print("type( response0.json() )")
    print(type(resp_list0))
    print(type(resp_list1))
    print(type(resp_list2))
    print(type(resp_list3))

    if isinstance(resp_list0, list) and isinstance(resp_list1, list) and isinstance(resp_list2, list) and isinstance(resp_list3, list):
        similarity_scores = resp_list0 + resp_list1 + resp_list2 + resp_list3  # NOTE: aggregate list is unused below
        # all four responses are score lists: sort descending copies for the processing func
        sorted0 = sorted(resp_list0, reverse=True)
        sorted1 = sorted(resp_list1, reverse=True)
        sorted2 = sorted(resp_list2, reverse=True)
        sorted3 = sorted(resp_list3, reverse=True)
        varcontinue_similarity = 1
    else:
        similarity_scores = "There's an error in llm similarity search retrieval"
        return similarity_scores
    
    time.sleep(2)
    result_processed = ""
    # if all four responses were score lists
    if varcontinue_similarity == 1:
        # the original had two branches (== 10 and > 10) calling the same
        # function, so a single >= 10 check is equivalent
        if len(all_results) >= 10:
            result_processed = process_similarity_15(sorted0, sorted1, sorted2, sorted3, resp_list0, resp_list1, resp_list2, resp_list3)
    # return all_results
    return result_processed
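

# A minimal sketch for the "pull pages" TODO above, not part of the current
# flow: fetch each result URL and count occurrences of the important keywords
# in the raw HTML. The function name and the (url, keywords) signature are
# assumptions for illustration.
def fetch_and_count_keywords(url, keywords):
    # returns {keyword: occurrence_count} for the page at `url`,
    # or an empty dict if the page cannot be fetched
    try:
        resp = requests.get(url, timeout=10)
        html = resp.text.lower()
    except Exception as e:
        print("page fetch failed:", e)
        return {}
    return {word: html.count(word.lower()) for word in keywords}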
    

        
def threadserver():
    # run the local llamafile embedding server; os.system blocks this thread until it exits
    print('hi')
    os.system('./mxbai-embed-large-v1-f16.llamafile --server --nobrowser')
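

# Sketch of how the llamafile server started above might be queried once it is
# up. The /embedding endpoint and port 8080 are assumptions based on the
# llama.cpp-style server that llamafile wraps; verify against your build.
def local_embedding(text):
    # returns the local server's JSON response for an embedding request
    resp = requests.post("http://127.0.0.1:8080/embedding", json={"content": text})
    return resp.json()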


 
def process_similarity_15(sorted0, sorted1, sorted2, sorted3, actualscore0, actualscore1, actualscore2, actualscore3):
    """Cross-check the four models' similarity scores and return (as a string)
    the unique candidate indexes that enough models agree on."""
    key_index = 0  # NOTE: currently unused
    # copy + loop to recover each score's original index
    print("actual scores")
    print("actual scores")
    print(actualscore0)
    print(actualscore1)
    print(actualscore2)
    print(actualscore3)

    print("the sorted0-3")
    print("the sorted0-3")
    print(sorted0)
    print(sorted1)
    print(sorted2)
    print(sorted3)
    print("end the sorted0-3")
    # Recover original indexes from each model's sorted scores (resp_list0..3)
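    # Only scores above 0.90 survive; note that list.index() returns the first
    # match, so tied scores all map to the same (first) index.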

    sorted0_with_index = []
    for x in sorted0:
        for y in actualscore0:
            if x == y and y > 0.90:
                sorted0_with_index.append(actualscore0.index(y))

    sorted1_with_index = []
    for x in sorted1:
        for y in actualscore1:
            if x == y and y > 0.90:
                sorted1_with_index.append(actualscore1.index(y))

    sorted2_with_index = []
    for x in sorted2:
        for y in actualscore2:
            if x == y and y > 0.90:
                sorted2_with_index.append(actualscore2.index(y))

    sorted3_with_index = []
    for x in sorted3:
        for y in actualscore3:
            if x == y and y > 0.90:
                sorted3_with_index.append(actualscore3.index(y))

    print("sorted0-3_with_index")
    print(sorted0_with_index)
    print(sorted1_with_index)
    print(sorted2_with_index)
    print(sorted3_with_index)
    

    # At this point the scores have been sorted and each model's qualifying
    # indexes are stored in the *_with_index lists.
    this_unique_list = set(sorted0_with_index + sorted1_with_index + sorted2_with_index + sorted3_with_index)  # NOTE: currently unused
    webgraph_list = []
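    # Voting scheme: an index is kept when all four models flag it, when any
    # two of the other three models also flag it, or when at least one other
    # model flags it and this model's own score for it exceeds 0.96.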
    for x in sorted0_with_index:
        if x in sorted3_with_index and x in sorted1_with_index and x in sorted2_with_index:
            webgraph_list.append(x)
        if (x in sorted1_with_index and x in sorted2_with_index) or (x in sorted3_with_index and x in sorted2_with_index) or (x in sorted1_with_index and x in sorted3_with_index):
            webgraph_list.append(x)
        # x is itself an index into actualscore0; the original indexed with a
        # loop counter into the wrong list, so look the score up with x directly
        if (x in sorted1_with_index or x in sorted2_with_index or x in sorted3_with_index) and actualscore0[x] > 0.96:
            webgraph_list.append(x)
    
    print("webgraph_list0")
    print("webgraph_list0")
    print(webgraph_list)
    iterator_x = 0
    for x in sorted1_with_index:
        print(x)
        if x in sorted3_with_index and x in sorted0_with_index and x in sorted2_with_index :
            webgraph_list.append(x)
        if ( x in sorted0_with_index and x in sorted2_with_index ) or ( x in sorted3_with_index and x in sorted2_with_index ) or ( x in sorted0_with_index and x in sorted3_with_index ):
            webgraph_list.append(x)
        if (x in sorted0_with_index or x in sorted2_with_index or x in sorted3_with_index ) and actualscore1[iterator_x] > 0.96 :
            webgraph_list.append(x)
        iterator_x = iterator_x + 1
    
    print("webgraph_list1")
    print("webgraph_list1")
    print(webgraph_list)


    iterator_x = 0
    for x in sorted2_with_index:
        print(x)
        if x in sorted3_with_index and x in sorted0_with_index and x in sorted1_with_index :
            webgraph_list.append(x)
        if ( x in sorted0_with_index and x in sorted1_with_index ) or ( x in sorted3_with_index and x in sorted1_with_index ) or ( x in sorted0_with_index and x in sorted3_with_index ):
            webgraph_list.append(x)
        if (x in sorted0_with_index or x in sorted1_with_index or x in sorted3_with_index ) and actualscore2[iterator_x] > 0.96 :
            webgraph_list.append(x)
        iterator_x = iterator_x + 1
    
    print("webgraph_list2")
    print("webgraph_list2")
    print(webgraph_list)


    iterator_x = 0
    for x in sorted3_with_index:
        print(x)
        if x in sorted1_with_index and x in sorted0_with_index and x in sorted2_with_index :
            webgraph_list.append(x)
        if ( x in sorted0_with_index and x in sorted2_with_index ) or ( x in sorted1_with_index and x in sorted2_with_index ) or ( x in sorted0_with_index and x in sorted1_with_index ):
            webgraph_list.append(x)
        if (x in sorted0_with_index or x in sorted2_with_index or x in sorted1_with_index ) and actualscore3[iterator_x] > 0.96 :
            webgraph_list.append(x)
        iterator_x = iterator_x + 1
    
    print("webgraph_list3")
    print("webgraph_list3")
    print(webgraph_list)
    print("webgraph_list")
    print(webgraph_list)


    return str( list(set(webgraph_list ) ) ) 
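

# Refactor sketch (not wired in): the four index-recovery loops above could
# share one helper. `indices_above_threshold` is a hypothetical name.
def indices_above_threshold(scores, threshold=0.90):
    # indices of `scores` ordered by descending score, keeping only entries
    # above `threshold`; duplicates resolve to distinct indices, unlike the
    # list.index() approach used above
    order = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
    return [i for i in order if scores[i] > threshold]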
 

if __name__ == '__main__':   
    app.run(host='0.0.0.0', port=8081)
    
    # server_app()