{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.12","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[{"sourceId":11249653,"sourceType":"datasetVersion","datasetId":7029569}],"dockerImageVersionId":30918,"isInternetEnabled":false,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"import pandas as pd\nimport numpy as np\n# reading the two files \nexplicit= pd.read_excel('../input/sexually-explicit/harashment.xlsx')\nnon_explicit=pd.read_excel('../input/sexually-explicit/Normal comments.xlsx')","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true,"execution":{"iopub.status.busy":"2025-08-24T10:55:53.098846Z","iopub.execute_input":"2025-08-24T10:55:53.099134Z","iopub.status.idle":"2025-08-24T10:55:57.252994Z","shell.execute_reply.started":"2025-08-24T10:55:53.099112Z","shell.execute_reply":"2025-08-24T10:55:57.252218Z"}},"outputs":[],"execution_count":1},{"cell_type":"code","source":"#taking the two frames as one dataframe data(appending)\ndata = pd.concat([explicit, non_explicit], ignore_index=True)\n#data=data[['comments','label']]\ndata=data.sample(frac=1).reset_index(drop=True)\nprint(data.tail())","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-24T10:55:57.253976Z","iopub.execute_input":"2025-08-24T10:55:57.254255Z","iopub.status.idle":"2025-08-24T10:55:57.265523Z","shell.execute_reply.started":"2025-08-24T10:55:57.254236Z","shell.execute_reply":"2025-08-24T10:55:57.264584Z"}},"outputs":[{"name":"stdout","text":" comments label\n34707 ከእለታት አንድ ቀን አንዲት ልጅ ወደ ቡዳ እየመጣች ነበር እና ብዙ ወንዶ... 1\n34708 የሴቶች ትልቁ ችግር ያወራቸው ያያቸው ሁሉ ሚወዳቸው ይመስላቸዋል 1\n34709 ላይ የኢትዮጵያ ሏላዊ ስልጣን ባለቤት የኢትዮጵያ ህዝብ ነው አንድ የኢትዮ... 0\n34710 አንዲት ልጅ ከትምህርት ቤት እየመጣች ሳለ አንድ ባለሱቅ ደውላ ወደ ሱቅ ... 1\n34711 ምሽት ላይ በአውቶቡስ ማቆሚያ ላይ ቆሜ ነበር አንዳንድ ወንዶች ልጆች የስ... 1\n","output_type":"stream"}],"execution_count":2},{"cell_type":"code","source":"#removing rows having an empty cells\ndata=data.dropna()\ndata.shape","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-24T10:55:57.267103Z","iopub.execute_input":"2025-08-24T10:55:57.267320Z","iopub.status.idle":"2025-08-24T10:55:57.293026Z","shell.execute_reply.started":"2025-08-24T10:55:57.267301Z","shell.execute_reply":"2025-08-24T10:55:57.292317Z"}},"outputs":[{"execution_count":3,"output_type":"execute_result","data":{"text/plain":"(34710, 2)"},"metadata":{}}],"execution_count":3},{"cell_type":"code","source":"data['word_len'] = data['comments'].str.split().str.len()\nprint(data['word_len'].max())","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-24T10:55:57.293975Z","iopub.execute_input":"2025-08-24T10:55:57.294216Z","iopub.status.idle":"2025-08-24T10:55:57.432867Z","shell.execute_reply.started":"2025-08-24T10:55:57.294197Z","shell.execute_reply":"2025-08-24T10:55:57.431855Z"}},"outputs":[{"name":"stdout","text":"618.0\n","output_type":"stream"}],"execution_count":4},{"cell_type":"code","source":"# pre-processing of the dataset\n#normalizarion\nimport re\n#method to normalize character level missmatch such as ጸሀይ and ፀሐይ\ndef normalization(input_token):\n rep1=re.sub('[ሃኅኃሐሓኻ]','ሀ',input_token)\n rep2=re.sub('[ሑኁዅ]','ሁ',rep1)\n rep3=re.sub('[ኂሒኺ]','ሂ',rep2)\n rep4=re.sub('[ኌሔዄ]','ሄ',rep3)\n rep5=re.sub('[ሕኅ]','ህ',rep4)\n rep6=re.sub('[ኆሖኾ]','ሆ',rep5)\n rep7=re.sub('[ሠ]','ሰ',rep6)\n rep8=re.sub('[ሡ]','ሱ',rep7)\n rep9=re.sub('[ሢ]','ሲ',rep8)\n rep10=re.sub('[ሣ]','ሳ',rep9)\n rep11=re.sub('[ሤ]','ሴ',rep10)\n rep12=re.sub('[ሥ]','ስ',rep11)\n rep13=re.sub('[ሦ]','ሶ',rep12)\n rep14=re.sub('[ዓኣዐ]','አ',rep13)\n rep15=re.sub('[ዑ]','ኡ',rep14)\n rep16=re.sub('[ዒ]','ኢ',rep15)\n rep17=re.sub('[ዔ]','ኤ',rep16)\n rep18=re.sub('[ዕ]','እ',rep17)\n rep19=re.sub('[ዖ]','ኦ',rep18)\n rep20=re.sub('[ጸ]','ፀ',rep19)\n rep21=re.sub('[ጹ]','ፁ',rep20)\n rep22=re.sub('[ጺ]','ፂ',rep21)\n rep23=re.sub('[ጻ]','ፃ',rep22)\n rep24=re.sub('[ጼ]','ፄ',rep23)\n rep25=re.sub('[ጽ]','ፅ',rep24)\n rep26=re.sub('[ጾ]','ፆ',rep25)\n rep27=re.sub('(ሉ[ዋአ])','ሏ',rep26)\n rep28=re.sub('(ሙ[ዋአ])','ሟ',rep27)\n rep29=re.sub('(ቱ[ዋአ])','ቷ',rep28)\n rep30=re.sub('(ሩ[ዋአ])','ሯ',rep29)\n rep31=re.sub('(ሱ[ዋአ])','ሷ',rep30)\n rep32=re.sub('(ሹ[ዋአ])','ሿ',rep31)\n rep33=re.sub('(ቁ[ዋአ])','ቋ',rep32)\n rep34=re.sub('(ቡ[ዋአ])','ቧ',rep33)\n rep35=re.sub('(ቹ[ዋአ])','ቿ',rep34)\n rep36=re.sub('(ሁ[ዋአ])','ኋ',rep35)\n rep37=re.sub('(ኑ[ዋአ])','ኗ',rep36)\n rep38=re.sub('(ኙ[ዋአ])','ኟ',rep37)\n rep39=re.sub('(ኩ[ዋአ])','ኳ',rep38)\n rep40=re.sub('(ዙ[ዋአ])','ዟ',rep39)\n rep41=re.sub('(ጉ[ዋአ])','ጓ',rep40)\n rep42=re.sub('(ደ[ዋአ])','ዷ',rep41)\n rep43=re.sub('(ጡ[ዋአ])','ጧ',rep42)\n rep44=re.sub('(ጩ[ዋአ])','ጯ',rep43)\n rep45=re.sub('(ጹ[ዋአ])','ጿ',rep44)\n rep46=re.sub('(ፉ[ዋአ])','ፏ',rep45)\n rep47=re.sub('[ቊ]','ቁ',rep46) #ቁ can be written as ቊ\n rep48=re.sub('[ኵ]','ኩ',rep47) #ኩ can be also written as ኵ\n return rep48","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-24T10:55:57.433566Z","iopub.execute_input":"2025-08-24T10:55:57.433774Z","iopub.status.idle":"2025-08-24T10:55:57.463260Z","shell.execute_reply.started":"2025-08-24T10:55:57.433756Z","shell.execute_reply":"2025-08-24T10:55:57.462194Z"}},"outputs":[],"execution_count":5},{"cell_type":"code","source":"#normalization of Amharic characters having the same sound and meaning\ndata['comments'] = data['comments'].astype(str)\ndata['comments']=data['comments'].apply(lambda x: normalization(x))","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-24T10:55:57.464227Z","iopub.execute_input":"2025-08-24T10:55:57.464763Z","iopub.status.idle":"2025-08-24T10:55:58.788656Z","shell.execute_reply.started":"2025-08-24T10:55:57.464718Z","shell.execute_reply":"2025-08-24T10:55:58.787981Z"}},"outputs":[],"execution_count":6},{"cell_type":"code","source":"#pre-processing function\ndef remove_non_amharic(text):\n # Define a regex pattern for Amharic characters (Unicode range)\n amharic_pattern = r'[^\\u1200-\\u137F\\s]' # Unicode range for Amharic characters\n amharic_pattern=r'[\\u1200-\\u137F]*[0-9]+[\\u1200-\\u137F]*'#removing digits\n amharic_pattern = r'[፡-፨]' # Unicode range for Amharic punctuation marks\n amharic_pattern=r'[^\\w\\s]'# removing white spaces\n # Use re.sub to replace non-Amharic characters with an empty string\n cleaned_text = re.sub(amharic_pattern, '', text)\n return cleaned_text","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-24T10:55:58.789529Z","iopub.execute_input":"2025-08-24T10:55:58.789781Z","iopub.status.idle":"2025-08-24T10:55:58.793993Z","shell.execute_reply.started":"2025-08-24T10:55:58.789760Z","shell.execute_reply":"2025-08-24T10:55:58.793062Z"}},"outputs":[],"execution_count":7},{"cell_type":"code","source":"# processing the data such as removing non-Amharic characters, digit,punctuation marks\ndata['comments']=data['comments'].apply(remove_non_amharic)\ndata.tail(20)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-24T10:55:58.796631Z","iopub.execute_input":"2025-08-24T10:55:58.796837Z","iopub.status.idle":"2025-08-24T10:55:58.905992Z","shell.execute_reply.started":"2025-08-24T10:55:58.796819Z","shell.execute_reply":"2025-08-24T10:55:58.905145Z"}},"outputs":[{"execution_count":8,"output_type":"execute_result","data":{"text/plain":" comments label word_len\n34692 አንተ ጎበዝ ነህ 1 3.0\n34693 ከትምህርት እየተመለስኩ ነበር እና አንድ ልጅ በጣም በሆነ መንገድ ለመን... 1 12.0\n34694 ከሉሲ ጀርባ ያለው የሀያላኑ ሴራ ለምን በአፋር ተገኘች እውነታው ተጋለጠ vi 0 12.0\n34695 ጠቅላይ ሚኒስትር ዶክተር አብይ አህመድ በሰላም መቶ በመቶ ያሸልማል መርሀ... 0 13.0\n34696 ዘመዴ ወደው ነው የሚዘግቡት በኛ በጀት እየተቀሳቁሱ የኦርቶዶክስ ድምፅ ማ... 0 11.0\n34697 RIP አንድም ቀን ዝቅ ያላሉ በተናገሩ ቁጥር ልቤን ከፍቼ የማዳምጣቸው የ... 0 17.0\n34698 በወይኑ እና ቢራ ሱቅ አቅራቢያ በሰከሩ ሰዎች መጎርጎር እና አስተያየት መ... 1 12.0\n34699 በመንገድ ዳር ስሄድ ይህ ሰው ፍንዳታዬን መንካት ፈልጎ ነበር ነገር ግን ... 1 24.0\n34700 Vipi WKeny የባይደን አስተዳደር ከዚህ ቀደም ከነበሩት አስተዳደሮች ... 0 22.0\n34701 አንድ ልጅ በጣም ወጣት ስለነበር የግል ጓደኞቹን ብልጭ ድርግም ይለኛል 1 10.0\n34702 ሞሪስ በጣም ግዙፍ wanker 1 4.0\n34703 በጣም ምርጥ ንግግር ነው ወቅታዊ የኢትዮጵያን ችግር አስረድተዋል ምናለ ለ... 0 13.0\n34704 ደግም ፓስታ ለመስራት ውሀ ምን ያደርጋል እንደገና ልታስጀምረኝ ነዉ እንዴ 0 10.0\n34705 በጦርነት ቋንቋ በጁንታው ላይ የተደረገ ያለው የህግ ማስከበር እርምጃ ቱ ... 0 11.0\n34706 አንተ ደግሞ ምን አይነት ደደብ ነህ እኔኮ አስተዋይ ነበር የምትመስለኝ ሁ... 1 27.0\n34707 ከእለታት አንድ ቀን አንዲት ልጅ ወደ ቡዳ እየመጣች ነበር እና ብዙ ወንዶ... 1 19.0\n34708 የሴቶች ትልቁ ችግር ያወራቸው ያያቸው ሁሉ ሚወዳቸው ይመስላቸዋል 1 8.0\n34709 ላይ የኢትዮጵያ ሏላዊ ስልጣን ባለቤት የኢትዮጵያ ህዝብ ነው አንድ የኢትዮ... 0 28.0\n34710 አንዲት ልጅ ከትምህርት ቤት እየመጣች ሳለ አንድ ባለሱቅ ደውላ ወደ ሱቅ ... 1 16.0\n34711 ምሽት ላይ በአውቶቡስ ማቆሚያ ላይ ቆሜ ነበር አንዳንድ ወንዶች ልጆች የስ... 1 16.0","text/html":"
| \n | comments | \nlabel | \nword_len | \n
|---|---|---|---|
| 34692 | \nአንተ ጎበዝ ነህ | \n1 | \n3.0 | \n
| 34693 | \nከትምህርት እየተመለስኩ ነበር እና አንድ ልጅ በጣም በሆነ መንገድ ለመን... | \n1 | \n12.0 | \n
| 34694 | \nከሉሲ ጀርባ ያለው የሀያላኑ ሴራ ለምን በአፋር ተገኘች እውነታው ተጋለጠ vi | \n0 | \n12.0 | \n
| 34695 | \nጠቅላይ ሚኒስትር ዶክተር አብይ አህመድ በሰላም መቶ በመቶ ያሸልማል መርሀ... | \n0 | \n13.0 | \n
| 34696 | \nዘመዴ ወደው ነው የሚዘግቡት በኛ በጀት እየተቀሳቁሱ የኦርቶዶክስ ድምፅ ማ... | \n0 | \n11.0 | \n
| 34697 | \nRIP አንድም ቀን ዝቅ ያላሉ በተናገሩ ቁጥር ልቤን ከፍቼ የማዳምጣቸው የ... | \n0 | \n17.0 | \n
| 34698 | \nበወይኑ እና ቢራ ሱቅ አቅራቢያ በሰከሩ ሰዎች መጎርጎር እና አስተያየት መ... | \n1 | \n12.0 | \n
| 34699 | \nበመንገድ ዳር ስሄድ ይህ ሰው ፍንዳታዬን መንካት ፈልጎ ነበር ነገር ግን ... | \n1 | \n24.0 | \n
| 34700 | \nVipi WKeny የባይደን አስተዳደር ከዚህ ቀደም ከነበሩት አስተዳደሮች ... | \n0 | \n22.0 | \n
| 34701 | \nአንድ ልጅ በጣም ወጣት ስለነበር የግል ጓደኞቹን ብልጭ ድርግም ይለኛል | \n1 | \n10.0 | \n
| 34702 | \nሞሪስ በጣም ግዙፍ wanker | \n1 | \n4.0 | \n
| 34703 | \nበጣም ምርጥ ንግግር ነው ወቅታዊ የኢትዮጵያን ችግር አስረድተዋል ምናለ ለ... | \n0 | \n13.0 | \n
| 34704 | \nደግም ፓስታ ለመስራት ውሀ ምን ያደርጋል እንደገና ልታስጀምረኝ ነዉ እንዴ | \n0 | \n10.0 | \n
| 34705 | \nበጦርነት ቋንቋ በጁንታው ላይ የተደረገ ያለው የህግ ማስከበር እርምጃ ቱ ... | \n0 | \n11.0 | \n
| 34706 | \nአንተ ደግሞ ምን አይነት ደደብ ነህ እኔኮ አስተዋይ ነበር የምትመስለኝ ሁ... | \n1 | \n27.0 | \n
| 34707 | \nከእለታት አንድ ቀን አንዲት ልጅ ወደ ቡዳ እየመጣች ነበር እና ብዙ ወንዶ... | \n1 | \n19.0 | \n
| 34708 | \nየሴቶች ትልቁ ችግር ያወራቸው ያያቸው ሁሉ ሚወዳቸው ይመስላቸዋል | \n1 | \n8.0 | \n
| 34709 | \nላይ የኢትዮጵያ ሏላዊ ስልጣን ባለቤት የኢትዮጵያ ህዝብ ነው አንድ የኢትዮ... | \n0 | \n28.0 | \n
| 34710 | \nአንዲት ልጅ ከትምህርት ቤት እየመጣች ሳለ አንድ ባለሱቅ ደውላ ወደ ሱቅ ... | \n1 | \n16.0 | \n
| 34711 | \nምሽት ላይ በአውቶቡስ ማቆሚያ ላይ ቆሜ ነበር አንዳንድ ወንዶች ልጆች የስ... | \n1 | \n16.0 | \n
| Epoch | \nTraining Loss | \nValidation Loss | \nPrecision | \nRecall | \nF1 | \nAccuracy | \n
|---|---|---|---|---|---|---|
| 1 | \n0.370300 | \n0.168155 | \n0.947850 | \n0.947065 | \n0.947058 | \n0.947065 | \n
| 2 | \n0.216800 | \n0.095888 | \n0.979873 | \n0.979834 | \n0.979835 | \n0.979834 | \n
| 3 | \n0.129200 | \n0.093063 | \n0.985258 | \n0.985236 | \n0.985236 | \n0.985236 | \n
| 4 | \n0.080500 | \n0.087753 | \n0.986735 | \n0.986676 | \n0.986677 | \n0.986676 | \n
"},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"Downloading builder script: 0.00B [00:00, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"f97aacb645704040a27c01086800cd7e"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"Downloading builder script: 0.00B [00:00, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"cfee8ff9408444189d4a5dae61068382"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"Downloading builder script: 0.00B [00:00, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"444fbdc793a840a592cfc9f7f0e3c07b"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"Downloading builder script: 0.00B [00:00, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"83564cfa99e2493ba097d12850b14967"}},"metadata":{}},{"name":"stderr","text":"/usr/local/lib/python3.10/dist-packages/torch/nn/parallel/_functions.py:71: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n warnings.warn(\n/usr/local/lib/python3.10/dist-packages/torch/nn/parallel/_functions.py:71: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n warnings.warn(\n/usr/local/lib/python3.10/dist-packages/torch/nn/parallel/_functions.py:71: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n warnings.warn(\n","output_type":"stream"}],"execution_count":38},{"cell_type":"code","source":"import matplotlib.pyplot as plt\n\n# Get the log history from the trainer\ntrain_loss = trainer.state.log_history\n\n# Initialize lists for epochs, training losses, and validation losses\nepochs = []\ntrain_losses = []\nval_losses = []\n\n# Extract training and validation losses safely\nfor entry in train_loss:\n if 'loss' in entry:\n epochs.append(entry['epoch'])\n train_losses.append(entry['loss'])\n if 'eval_loss' in entry: # Check for validation loss\n val_losses.append(entry['eval_loss'])\n\n# Check if we have losses to plot\nif train_losses:\n plt.figure(figsize=(10, 6))\n plt.plot(epochs, train_losses, label='Training Loss', color='blue')\n \n # Plot validation loss if available\n if val_losses:\n plt.plot(epochs[:len(val_losses)], val_losses, label='Validation Loss', color='orange')\n \n plt.title('Training and Validation Loss Over Epochs')\n plt.xlabel('Epochs')\n plt.ylabel('Loss')\n plt.grid()\n plt.legend()\n plt.show()\nelse:\n print(\"No training loss data available.\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-24T14:13:27.384798Z","iopub.execute_input":"2025-08-24T14:13:27.385141Z","iopub.status.idle":"2025-08-24T14:13:27.587831Z","shell.execute_reply.started":"2025-08-24T14:13:27.385118Z","shell.execute_reply":"2025-08-24T14:13:27.586894Z"}},"outputs":[{"output_type":"display_data","data":{"text/plain":"