Spaces:

pojitha
/

sinhala_hate_speech

Runtime error

App Files Files Community

pojitha committed on Feb 16, 2023

Commit

5156c87

1 Parent(s): f7f9300

Upload 9 files

Browse files

Files changed (9) hide show

.gitattributes +4 -34
.github/workflows/main.yml +20 -0
Sinhala_Singlish_Hate_Speech.csv +0 -0
StopWords_425.txt +0 -0
Suffixes-413.txt +0 -0
app.py +121 -0
requirements.txt +0 -0
sinhala-hate-speech-dataset +3 -0
sinhala-hate-speech-dataset.csv +0 -0

.gitattributes CHANGED Viewed

@@ -1,34 +1,4 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

+*.csv filter=lfs diff=lfs merge=lfs -text
+sinhala-hate-speech-dataset.csv filter=lfs diff=lfs merge=lfs -text
+Sinhala_Singlish_Hate_Speech.csv filter=lfs diff=lfs merge=lfs -text
+sinhala-hate-speech-dataset filter=lfs diff=lfs merge=lfs -text

.github/workflows/main.yml ADDED Viewed

	@@ -0,0 +1,20 @@

+# Pushes this repository's main branch to a Hugging Face Space on every push
+# to main, or when triggered manually.
+name: Sync to Hugging Face hub
+on:
+  push:
+    branches: [main]
+  # to run this workflow manually from the Actions tab
+  workflow_dispatch:
+jobs:
+  sync-to-hub:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          # Check out the complete history and the LFS-tracked files so the push below carries them.
+          fetch-depth: 0
+          lfs: true
+      - name: Push to hub
+        env:
+          # Access token read from the repository secret named HF_TOKEN.
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        # NOTE(review): the target Space here is "sinhalahatespeech"; confirm it matches the Space's actual name.
+        run: git push --force https://pojitha:$HF_TOKEN@huggingface.co/spaces/pojitha/sinhalahatespeech main

Sinhala_Singlish_Hate_Speech.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

StopWords_425.txt ADDED Viewed

Binary file (9.2 kB). View file

Suffixes-413.txt ADDED Viewed

Binary file (5.32 kB). View file

app.py ADDED Viewed

	@@ -0,0 +1,121 @@

+import numpy
+from sklearn.pipeline import Pipeline
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.svm import SVC
+from sklearn.metrics import accuracy_score
+import pandas as pd
+import numpy as np
+import streamlit as st
+# Load the two labelled CSV corpora that sit next to this script.
+df1 = pd.read_csv('sinhala-hate-speech-dataset.csv')
+df2 = pd.read_csv('Sinhala_Singlish_Hate_Speech.csv')
+# Rename df2's three columns and turn its "YES" labels into 1, everything else into 0.
+# NOTE(review): presumably this aligns df2 with df1's 'comment'/'label' columns — confirm against the CSV.
+df2.columns= ["id","comment","label"]
+df2['label'] = df2['label'].apply(lambda x: 1 if x == "YES" else 0)
+# Stack both frames; each frame keeps its own row index (no ignore_index).
+df = pd.concat([df1, df2], sort=False)
+# NOTE(review): the null count is computed but its result is discarded.
+df.isnull().sum()
+import re
+exclude = set(",.:;'\"-?!/´`%")
+def remove_punctutation(text):
+  return ''.join([(i if i not in exclude else " ") for i in text])
+def remove_numbers(text):
+  return ''.join(c for c in text if not c.isnumeric())
+df['clean_data'] = df['comment'].apply(lambda x: remove_punctutation((x)))
+df['cleand'] = df['clean_data'].apply(lambda x: remove_numbers(x))
+import nltk
+from nltk.tokenize import word_tokenize
+nltk.download('punkt')
+df['tokens'] = df['cleand'].apply(word_tokenize)
+with open("StopWords_425.txt", "r",encoding="utf-16") as file:
+    # Read the contents of the file
+    contents = file.read()
+stop_word = contents.split()
+stop_word = [word for word in stop_word if not any(char.isdigit() for char in word)]
+print(stop_word)
+df['tokens'] = df['tokens'].apply(lambda x: [item for item in x if item not in stop_word])
+import nltk
+from nltk.tokenize import word_tokenize
+with open('Suffixes-413.txt', 'r', encoding='utf-16') as f:
+    stemmed_words = f.readlines()
+stemmed_words = [word for word in stemmed_words if not any(char.isdigit() for char in word)]
+stemmed_words = [word.strip() for word in stemmed_words]
+stemmed_words = set(stemmed_words)
+def stem_word(word):
+    if word in stemmed_words:
+        return word
+    else:
+        return nltk.stem.PorterStemmer().stem(word)
+df['cleaneddata'] = df['tokens'].apply(lambda x: [stem_word(word) for word in x])
+# Text classifier: TF-IDF features (custom stop words and token pattern) fed into an SVC with default settings.
+pipeline = Pipeline([
+    ('tfidf', TfidfVectorizer(stop_words=stop_word, token_pattern=r'\b\w+\b')),
+    ('svm', SVC())
+])
+from sklearn.model_selection import train_test_split
+# 70/30 split with no random_state set. The model is fitted on the raw 'comment'
+# column, not on the cleaned/tokenised columns derived earlier in this script.
+X_train, X_test, y_train, y_test = train_test_split(df['comment'], df['label'], test_size=0.3)
+pipeline.fit(X_train, y_train)
+st.title("Sinhala Hate Speech Detector")
+# Define the user input section
+user_input = st.text_input("Enter a sentence")
+# Define the model output section
+if user_input:
+    # Check if the sentence is hate or not
+    user_pred = pipeline.predict([user_input])[0]
+    if user_pred == 1:
+        st.write("This sentence is hate.")
+        add_to_df = st.selectbox("Is this correct?", ["Choose a Option","Yes", "No"],index=0)
+        if add_to_df == "Yes":
+            st.write("Thank you")
+        else:
+            processed_text = pd.Series(user_input)
+            df = df.append({'comment': user_input, 'label': 0}, ignore_index=True)
+            df.to_csv("sinhala-hate-speech-dataset", index=False)
+            X_train, X_test, y_train, y_test = train_test_split(df['comment'], df['label'], test_size=0.3)
+            X_train = X_train.append(processed_text, ignore_index=True)
+            y_train = y_train.append(pd.Series([0]))
+            pipeline.fit(X_train, y_train)
+            st.write("Thank you for your contribution. We added that word into our system.")
+    else:
+        st.write("This sentence is not hate.")
+        add_to_df = st.selectbox("Is this correct?", ["Choose a Option","Yes", "No"],index=0)
+        if add_to_df == "Yes":
+            st.write("Thank you")
+        else:
+            processed_text = pd.Series(user_input)
+            df = df.append({'comment': user_input, 'label': 1}, ignore_index=True)
+            df.to_csv("sinhala-hate-speech-dataset.csv",index=True)
+            X_train, X_test, y_train, y_test = train_test_split(df['comment'], df['label'], test_size=0.3)
+            X_train = X_train.append(processed_text, ignore_index=True)
+            y_train = y_train.append(pd.Series([1]))
+            pipeline.fit(X_train, y_train)
+            st.write("Thank you for your contribution. We added that word into our system.")

requirements.txt ADDED Viewed

Binary file (41.5 kB). View file

sinhala-hate-speech-dataset ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:372a6f64a4b68a8f5f820eac885dfa3526151acfab38dfb725d03f821de77c94
+size 12901950

sinhala-hate-speech-dataset.csv ADDED Viewed

The diff for this file is too large to render. See raw diff