import asyncio
import math
import os
import re
import threading
from collections import Counter
from datetime import date
from urllib.request import urlopen, Request

import fitz
import httpx
import nltk
import numpy as np
import requests
import torch
import yaml
from bs4 import BeautifulSoup
from googleapiclient.discovery import build
from scipy.special import softmax
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer
from unidecode import unidecode
def remove_accents(input_str):
    # Transliterate accented characters to their closest ASCII equivalents.
    return unidecode(input_str)


def remove_special_characters(text):
    text = remove_accents(text)
    # Keep word characters, whitespace and common punctuation; drop everything else.
    # The hyphen is placed at the end of the character class so it is treated
    # literally rather than as a range.
    pattern = r'[^\w\s.,!?\'"();-]+'
    text = re.sub(pattern, "", text)
    return text
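# Illustrative example of the cleaning helpers above (sample string is made up):
#
#   remove_special_characters("Café costs $5, okay?")
#   -> "Cafe costs 5, okay?"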
def remove_special_characters_2(text):
    # Stricter cleanup: keep only alphanumeric characters and spaces.
    pattern = r"[^a-zA-Z0-9 ]+"
    text = re.sub(pattern, "", text)
    return text


def update_character_count(text):
    return f"{len(text)} characters"
nltk.download("punkt")

with open("config.yaml", "r") as file:
    params = yaml.safe_load(file)

text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
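# A minimal config.yaml sketch for the loading code above, assuming only the key
# used in this file; the path value is a placeholder, not the project's actual
# checkpoint:
#
#   TEXT_BC_MODEL_PATH: "path/to/text-binary-classifier"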
def len_validator(text):
    min_tokens = 200
    length = len(text_bc_tokenizer.tokenize(text))
    if length < min_tokens:
        return f"Warning! Input length is {length}. Please input a text that is longer than {min_tokens} tokens. Recommended length: {min_tokens * 2} tokens."
    return f"Input length ({length}) is satisfied."
def extract_text_from_pdf(pdf_path):
    # Concatenate the text of every page in the PDF using PyMuPDF (fitz).
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text
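# Example usage (hypothetical file path):
#
#   raw_text = extract_text_from_pdf("input.pdf")
#   cleaned = remove_special_characters(raw_text)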
WORD = re.compile(r"\w+")

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
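# A minimal sketch of how the SentenceTransformer model above can be used for
# semantic similarity (the commented-out helpers below wrap the same calls); the
# example strings are made up:
#
#   emb_1 = model.encode("The cat sat on the mat.", convert_to_tensor=True)
#   emb_2 = model.encode("A cat is sitting on a mat.", convert_to_tensor=True)
#   score = util.pytorch_cos_sim(emb_1, emb_2).item()  # float in [-1, 1]; ~1 means very similar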
# returns cosine similarity of two vectors
# input: two vectors
# output: float between 0 and 1
# def get_cosine(vec1, vec2):
#     intersection = set(vec1.keys()) & set(vec2.keys())
#     # calculating numerator
#     numerator = sum([vec1[x] * vec2[x] for x in intersection])
#     # calculating denominator
#     sum1 = sum([vec1[x] ** 2 for x in vec1.keys()])
#     sum2 = sum([vec2[x] ** 2 for x in vec2.keys()])
#     denominator = math.sqrt(sum1) * math.sqrt(sum2)
#     # checking for divide by zero
#     if denominator == 0:
#         return 0.0
#     else:
#         return float(numerator) / denominator


# # converts given text into a vector
# def text_to_vector(text):
#     # uses the Regular expression above and gets all words
#     words = WORD.findall(text)
#     # returns a counter of all the words (count of number of occurrences)
#     return Counter(words)


# # returns cosine similarity of two texts
# # uses: text_to_vector(text) and get_cosine(v1, v2)
# def cosineSim(text1, text2):
#     vector1 = text_to_vector(text1)
#     vector2 = text_to_vector(text2)
#     # print vector1,vector2
#     cosine = get_cosine(vector1, vector2)
#     return cosine
# def cos_sim_torch(embedding_1, embedding_2):
#     return util.pytorch_cos_sim(embedding_1, embedding_2).item()


# def embed_text(text):
#     return model.encode(text, convert_to_tensor=True)


# def sentence_similarity(text1, text2):
#     embedding_1 = model.encode(text1, convert_to_tensor=True)
#     embedding_2 = model.encode(text2, convert_to_tensor=True)
#     o = util.pytorch_cos_sim(embedding_1, embedding_2)
#     return o.item()
# def get_soup_requests(url):
#     page = requests.get(url)
#     if page.status_code == 200:
#         soup = BeautifulSoup(page.content, "html.parser")
#         return soup
#     print("HTML soup failed")
#     return None


# def get_soup_httpx(url):
#     client = httpx.Client(timeout=30)
#     try:
#         page = client.get(url)
#         if page.status_code == httpx.codes.OK:
#             soup = BeautifulSoup(page.content, "html.parser")
#             return soup
#     except:
#         print("HTTPx soup failed")
#         return None
# def getSentences(text):
#     from nltk.tokenize import sent_tokenize
#     sents = sent_tokenize(text)
#     two_sents = []
#     for i in range(len(sents)):
#         if (i % 2) == 0:
#             two_sents.append(sents[i])
#         else:
#             two_sents[len(two_sents) - 1] += " " + sents[i]
#     return two_sents
# def googleSearch(
#     plag_option,
#     sentences,
#     urlCount,
#     scoreArray,
#     urlList,
#     sorted_date,
#     domains_to_skip,
#     api_key,
#     cse_id,
#     **kwargs,
# ):
#     service = build("customsearch", "v1", developerKey=api_key)
#     for i, sentence in enumerate(sentences):
#         results = (
#             service.cse()
#             .list(q=sentence, cx=cse_id, sort=sorted_date, **kwargs)
#             .execute()
#         )
#         if "items" in results and len(results["items"]) > 0:
#             for count, link in enumerate(results["items"]):
#                 # only consider the first 3 results per sentence
#                 if count >= 3:
#                     break
#                 # skip user-selected domains
#                 if any(
#                     ("." + domain) in link["link"] for domain in domains_to_skip
#                 ):
#                     continue
#                 # clean up '...' in the snippet
#                 snippet = link["snippet"]
#                 ind = snippet.find("...")
#                 if ind < 20 and ind > 9:
#                     snippet = snippet[ind + len("... ") :]
#                 ind = snippet.find("...")
#                 if ind > len(snippet) - 5:
#                     snippet = snippet[:ind]
#                 # update cosine similarity between snippet and given text
#                 url = link["link"]
#                 if url not in urlList:
#                     urlList.append(url)
#                     scoreArray.append([0] * len(sentences))
#                 urlCount[url] = urlCount[url] + 1 if url in urlCount else 1
#                 if plag_option == "Standard":
#                     scoreArray[urlList.index(url)][i] = cosineSim(
#                         sentence, snippet
#                     )
#                 else:
#                     scoreArray[urlList.index(url)][i] = sentence_similarity(
#                         sentence, snippet
#                     )
#         else:
#             print("Google Search failed")
#     return urlCount, scoreArray
# def getQueries(text, n):
#     # return n-grams of size n
#     words = text.split()
#     return [words[i : i + n] for i in range(len(words) - n + 1)]


# def print2D(array):
#     print(np.array(array))


# def removePunc(text):
#     res = re.sub(r"[^\w\s]", "", text)
#     return res
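# Illustrative example of the n-gram helper above (kept as a comment, like the
# code it documents):
#
#   getQueries("the quick brown fox jumps over", 5)
#   -> [["the", "quick", "brown", "fox", "jumps"],
#       ["quick", "brown", "fox", "jumps", "over"]]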
# async def get_url_data(url, client):
#     try:
#         r = await client.get(url)
#         # print(r.status_code)
#         if r.status_code == 200:
#             # print("in")
#             soup = BeautifulSoup(r.content, "html.parser")
#             return soup
#     except Exception:
#         print("HTTPx parallel soup failed")
#         return None


# async def parallel_scrap(urls):
#     async with httpx.AsyncClient(timeout=30) as client:
#         tasks = []
#         for url in urls:
#             tasks.append(get_url_data(url=url, client=client))
#         results = await asyncio.gather(*tasks, return_exceptions=True)
#         return results
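# A minimal sketch of how the scraper above could be driven (kept as a comment,
# since parallel_scrap itself is commented out; the URLs are placeholders):
#
#   urls = ["https://example.com/a", "https://example.com/b"]
#   soups = asyncio.run(parallel_scrap(urls))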
# class TimeoutError(Exception):
#     pass


# def matchingScore(sentence, content):
#     if sentence in content:
#         return 1
#     sentence = removePunc(sentence)
#     content = removePunc(content)
#     if sentence in content:
#         return 1
#     else:
#         n = 5
#         ngrams = getQueries(sentence, n)
#         if len(ngrams) == 0:
#             return 0
#         matched = [x for x in ngrams if " ".join(x) in content]
#         return len(matched) / len(ngrams)
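# Worked example of the ratio above (illustrative numbers): a 12-word sentence
# yields 12 - 5 + 1 = 8 word 5-grams; if 6 of them appear verbatim in the page
# content, matchingScore returns 6 / 8 = 0.75.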
# # def matchingScoreWithTimeout(sentence, content):
# #     def timeout_handler():
# #         raise TimeoutError("Function timed out")
# #
# #     timer = threading.Timer(10, timeout_handler)  # Set a timer for 10 seconds
# #     timer.start()
# #     try:
# #         score = sentence_similarity(sentence, content)
# #         # score = matchingScore(sentence, content)
# #         timer.cancel()  # Cancel the timer if calculation completes before timeout
# #         return score
# #     except TimeoutError:
# #         return 0


# # async def matchingScoreAsync(sentences, content, content_idx, ScoreArray):
# #     content = removePunc(content)
# #     for j, sentence in enumerate(sentences):
# #         sentence = removePunc(sentence)
# #         if sentence in content:
# #             ScoreArray[content_idx][j] = 1
# #         else:
# #             n = 5
# #             ngrams = getQueries(sentence, n)
# #             if len(ngrams) == 0:
# #                 return 0
# #             matched = [x for x in ngrams if " ".join(x) in content]
# #             ScoreArray[content_idx][j] = len(matched) / len(ngrams)
# #     print(
# #         f"Analyzed {content_idx+1} of soups (SOUP SUCCEEDED)........................"
# #     )
# #     return ScoreArray
# async def matchingScoreAsync(
#     sentences, content, content_idx, ScoreArray, model, util
# ):
#     content = removePunc(content)
#     for j, sentence in enumerate(sentences):
#         sentence = removePunc(sentence)
#         similarity_score = sentence_similarity(sentence, content, model, util)
#         ScoreArray[content_idx][j] = similarity_score
#     print(
#         f"Analyzed {content_idx+1} of contents (CONTENT ANALYZED)........................"
#     )
#     return ScoreArray
# async def parallel_analyze(soups, sentences, ScoreArray):
#     tasks = []
#     for i, soup in enumerate(soups):
#         if soup:
#             page_content = soup.text
#             tasks.append(
#                 matchingScoreAsync(sentences, page_content, i, ScoreArray)
#             )
#         else:
#             print(
#                 f"Analyzed {i+1} of soups (SOUP FAILED)........................"
#             )
#     ScoreArray = await asyncio.gather(*tasks, return_exceptions=True)
#     return ScoreArray
# async def parallel_analyze_2(soups, sentences, ScoreArray):
#     tasks = [[0] * len(ScoreArray[0]) for i in range(len(ScoreArray))]
#     for i, soup in enumerate(soups):
#         if soup:
#             page_content = soup.text
#             for j, sent in enumerate(sentences):
#                 print(
#                     f"Analyzing {i+1} of {len(soups)} soups with {j+1} of {len(sentences)} sentences........................"
#                 )
#                 tasks[i][j] = sentence_similarity(sent, page_content)
#         else:
#             print(
#                 f"Analyzed {i+1} of soups (SOUP FAILED)........................"
#             )
#     ScoreArray = await asyncio.gather(*tasks, return_exceptions=True)
#     return ScoreArray