|
from nltk.corpus import wordnet |
|
import re |
|
from nltk.stem import WordNetLemmatizer |
|
|
|
# English stop-word list: the standard 179-entry NLTK English stop words,
# followed by 47 apostrophe-stripped contractions and informal spellings
# (e.g. "dont", "gonna") that survive the punctuation removal performed in
# en_hi_detection. Order and contents are significant only for membership
# tests; duplicates ("its", "were") are harmless.
stop_words = [
    # --- NLTK English stop words ---
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
    'you', "you're", "you've", "you'll", "you'd", 'your', 'yours',
    'yourself', 'yourselves',
    'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself',
    'it', "it's", 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves',
    'what', 'which', 'who', 'whom', 'this', 'that', "that'll",
    'these', 'those',
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
    'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as',
    'until', 'while',
    'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between',
    'into', 'through', 'during', 'before', 'after', 'above', 'below',
    'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
    'again', 'further', 'then', 'once',
    'here', 'there', 'when', 'where', 'why', 'how',
    'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other',
    'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same',
    'so', 'than', 'too', 'very',
    's', 't', 'can', 'will', 'just', 'don', "don't",
    'should', "should've", 'now',
    'd', 'll', 'm', 'o', 're', 've', 'y',
    'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't",
    'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't",
    'haven', "haven't", 'isn', "isn't", 'ma',
    'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't",
    'shan', "shan't", 'shouldn', "shouldn't",
    'wasn', "wasn't", 'weren', "weren't",
    'won', "won't", 'wouldn', "wouldn't",
    # --- apostrophe-stripped contractions and informal forms ---
    'its', 'whats', 'im', 'youre', 'hes', 'shes', 'were', 'theyre',
    'cant', 'dont', 'wont', 'isnt', 'arent', 'wasnt', 'werent',
    'couldnt', 'shouldnt', 'wouldnt',
    'ive', 'youve', 'weve', 'theyve', 'id', 'youd',
    'lets', 'thats', 'theres', 'heres', 'ill', 'hell', 'shell',
    'mustnt', 'mightnt', 'shant', 'neednt',
    'oclock', 'cause',
    'gimme', 'wanna', 'gonna', 'kinda', 'sorta', 'lemme',
    'aint', 'dunno', 'gotta', 'yall',
]
|
|
|
# Shared lemmatizer instance; en_hi_detection uses it to reduce each token
# to its WordNet base form before the vocabulary lookup.
lemmatizer = WordNetLemmatizer()
|
|
|
|
|
|
|
|
|
|
|
# Inclusive code-point bounds of the Unicode Devanagari block, the script
# used to write Hindi.
HINDI_UNICODE_RANGE = (0x0900, 0x097F)


def is_hindi_letter(c):
    """Return True when single character *c* lies in the Devanagari block."""
    low, high = HINDI_UNICODE_RANGE
    return low <= ord(c) <= high
|
|
|
|
|
|
|
|
|
|
|
|
|
def en_hi_detection(text):
    """Classify *text* as English, Hindi, or unknown.

    A token counts as English when any of its four POS lemmatizations is a
    WordNet lemma or a stop word; it counts as Hindi when it contains at
    least one Devanagari character.

    Returns:
        "eng" if more than 70% of tokens look English,
        "hi" if more than 75% of tokens contain Devanagari characters,
        "unknown" otherwise (including empty / punctuation-only input).
    """
    # Replace punctuation with spaces so splitting yields bare word tokens.
    text = re.sub(r'[^\w\s]', ' ', text)
    words = text.lower().strip().split()

    # Guard: empty or punctuation-only input previously raised
    # ZeroDivisionError in the ratio computations below.
    if not words:
        return "unknown"

    # Materialise the WordNet vocabulary once. The original tested
    # `lem_word in wordnet.words()`, which rebuilds and linearly scans the
    # whole lemma-name generator for every (word, pos) pair.
    wordnet_vocab = set(wordnet.words())

    count_en = 0
    for word in words:
        # Try each POS; the first lemma recognised as English wins.
        for pos in (wordnet.NOUN, wordnet.VERB, wordnet.ADJ, wordnet.ADV):
            lem_word = lemmatizer.lemmatize(word, pos)
            if lem_word in wordnet_vocab:
                print("wordnet :", lem_word)
                count_en += 1
                break
            elif lem_word in stop_words:
                print("stop_words :", lem_word)
                count_en += 1
                break

    # Number of tokens containing at least one Devanagari character.
    count_hi = sum(
        1 for word in words if any(is_hindi_letter(c) for c in word)
    )

    total = len(words)
    if count_en / total * 100 > 70:
        return "eng"
    elif count_hi / total * 100 > 75:
        return "hi"
    else:
        return "unknown"
|
|
|
|