{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.12","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[{"sourceId":11249653,"sourceType":"datasetVersion","datasetId":7029569}],"dockerImageVersionId":30918,"isInternetEnabled":false,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"import pandas as pd\nimport numpy as np\n# reading the two files \nexplicit= pd.read_excel('../input/sexually-explicit/harashment.xlsx')\nnon_explicit=pd.read_excel('../input/sexually-explicit/Normal comments.xlsx')","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true,"execution":{"iopub.status.busy":"2025-08-24T10:55:53.098846Z","iopub.execute_input":"2025-08-24T10:55:53.099134Z","iopub.status.idle":"2025-08-24T10:55:57.252994Z","shell.execute_reply.started":"2025-08-24T10:55:53.099112Z","shell.execute_reply":"2025-08-24T10:55:57.252218Z"}},"outputs":[],"execution_count":1},{"cell_type":"code","source":"# stack the explicit and non-explicit comments into a single dataframe\ndata = pd.concat([explicit, non_explicit], ignore_index=True)\n#data=data[['comments','label']]\n# shuffle the rows; the fixed seed keeps the row order (and every split derived from it) reproducible\ndata=data.sample(frac=1, random_state=42).reset_index(drop=True)\nprint(data.tail())","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-24T10:55:57.253976Z","iopub.execute_input":"2025-08-24T10:55:57.254255Z","iopub.status.idle":"2025-08-24T10:55:57.265523Z","shell.execute_reply.started":"2025-08-24T10:55:57.254236Z","shell.execute_reply":"2025-08-24T10:55:57.264584Z"}},"outputs":[{"name":"stdout","text":" comments label\n34707 ከእለታት አንድ ቀን አንዲት ልጅ ወደ ቡዳ እየመጣች ነበር እና ብዙ ወንዶ... 
1\n34708 የሴቶች ትልቁ ችግር ያወራቸው ያያቸው ሁሉ ሚወዳቸው ይመስላቸዋል 1\n34709 ላይ የኢትዮጵያ ሏላዊ ስልጣን ባለቤት የኢትዮጵያ ህዝብ ነው አንድ የኢትዮ... 0\n34710 አንዲት ልጅ ከትምህርት ቤት እየመጣች ሳለ አንድ ባለሱቅ ደውላ ወደ ሱቅ ... 1\n34711 ምሽት ላይ በአውቶቡስ ማቆሚያ ላይ ቆሜ ነበር አንዳንድ ወንዶች ልጆች የስ... 1\n","output_type":"stream"}],"execution_count":2},{"cell_type":"code","source":"#removing rows having an empty cells\ndata=data.dropna()\ndata.shape","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-24T10:55:57.267103Z","iopub.execute_input":"2025-08-24T10:55:57.267320Z","iopub.status.idle":"2025-08-24T10:55:57.293026Z","shell.execute_reply.started":"2025-08-24T10:55:57.267301Z","shell.execute_reply":"2025-08-24T10:55:57.292317Z"}},"outputs":[{"execution_count":3,"output_type":"execute_result","data":{"text/plain":"(34710, 2)"},"metadata":{}}],"execution_count":3},{"cell_type":"code","source":"data['word_len'] = data['comments'].str.split().str.len()\nprint(data['word_len'].max())","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-24T10:55:57.293975Z","iopub.execute_input":"2025-08-24T10:55:57.294216Z","iopub.status.idle":"2025-08-24T10:55:57.432867Z","shell.execute_reply.started":"2025-08-24T10:55:57.294197Z","shell.execute_reply":"2025-08-24T10:55:57.431855Z"}},"outputs":[{"name":"stdout","text":"618.0\n","output_type":"stream"}],"execution_count":4},{"cell_type":"code","source":"# pre-processing of the dataset\n#normalization\nimport re\n#method to normalize character level mismatch such as ጸሀይ and ፀሐይ\ndef normalization(input_token):\n    \"\"\"Collapse Amharic spelling variants into one canonical form.\n\n    Characters that sound the same are mapped to a single representative\n    (for example ሐ -> ሀ, ሠ -> ሰ, ዐ -> አ, ጸ -> ፀ), and a u-order syllable\n    followed by ዋ or አ is merged into its labialized form (ሉዋ -> ሏ).\n\n    The rules are applied in the listed order; the merge rules operate on\n    the output of the character rules above them.\n\n    Args:\n        input_token: text to normalize (str).\n\n    Returns:\n        The normalized text.\n    \"\"\"\n    rules = (\n        ('[ሃኀኃሐሓኻ]', 'ሀ'),  # ኀ (U+1280) belongs here; ኅ is handled by the ህ rule below\n        ('[ሑኁዅ]', 'ሁ'),\n        ('[ኂሒኺ]', 'ሂ'),\n        ('[ኌሔዄ]', 'ሄ'),\n        ('[ሕኅ]', 'ህ'),\n        ('[ኆሖኾ]', 'ሆ'),\n        ('[ሠ]', 'ሰ'),\n        ('[ሡ]', 'ሱ'),\n        ('[ሢ]', 'ሲ'),\n        ('[ሣ]', 'ሳ'),\n        ('[ሤ]', 'ሴ'),\n        ('[ሥ]', 'ስ'),\n        ('[ሦ]', 'ሶ'),\n        ('[ዓኣዐ]', 'አ'),\n        ('[ዑ]', 'ኡ'),\n        ('[ዒ]', 'ኢ'),\n        ('[ዔ]', 'ኤ'),\n        ('[ዕ]', 'እ'),\n        ('[ዖ]', 'ኦ'),\n        ('[ጸ]', 'ፀ'),\n        ('[ጹ]', 'ፁ'),\n        ('[ጺ]', 'ፂ'),\n        ('[ጻ]', 'ፃ'),\n        ('[ጼ]', 'ፄ'),\n        ('[ጽ]', 'ፅ'),\n        ('[ጾ]', 'ፆ'),\n        ('(ሉ[ዋአ])', 'ሏ'),\n        ('(ሙ[ዋአ])', 'ሟ'),\n        ('(ቱ[ዋአ])', 'ቷ'),\n        ('(ሩ[ዋአ])', 'ሯ'),\n        ('(ሱ[ዋአ])', 'ሷ'),\n        ('(ሹ[ዋአ])', 'ሿ'),\n        ('(ቁ[ዋአ])', 'ቋ'),\n        ('(ቡ[ዋአ])', 'ቧ'),\n        ('(ቹ[ዋአ])', 'ቿ'),\n        ('(ሁ[ዋአ])', 'ኋ'),\n        ('(ኑ[ዋአ])', 'ኗ'),\n        ('(ኙ[ዋአ])', 'ኟ'),\n        ('(ኩ[ዋአ])', 'ኳ'),\n        ('(ዙ[ዋአ])', 'ዟ'),\n        ('(ጉ[ዋአ])', 'ጓ'),\n        ('(ዱ[ዋአ])', 'ዷ'),  # ዱ (not ደ), like every other merge rule\n        ('(ጡ[ዋአ])', 'ጧ'),\n        ('(ጩ[ዋአ])', 'ጯ'),\n        ('(ፁ[ዋአ])', 'ጿ'),  # ጹ has already become ፁ above\n        ('(ፉ[ዋአ])', 'ፏ'),\n        ('[ቊ]', 'ቁ'),  # ቁ can be written as ቊ\n        ('[ኵ]', 'ኩ'),  # ኩ can be also written as ኵ\n    )\n    normalized = input_token\n    for pattern, replacement in rules:\n        normalized = re.sub(pattern, replacement, normalized)\n    return normalized","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-24T10:55:57.433566Z","iopub.execute_input":"2025-08-24T10:55:57.433774Z","iopub.status.idle":"2025-08-24T10:55:57.463260Z","shell.execute_reply.started":"2025-08-24T10:55:57.433756Z","shell.execute_reply":"2025-08-24T10:55:57.462194Z"}},"outputs":[],"execution_count":5},{"cell_type":"code","source":"#normalization of Amharic characters having the same sound and meaning\ndata['comments'] = data['comments'].astype(str)\ndata['comments']=data['comments'].apply(lambda x: 
normalization(x))","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-24T10:55:57.464227Z","iopub.execute_input":"2025-08-24T10:55:57.464763Z","iopub.status.idle":"2025-08-24T10:55:58.788656Z","shell.execute_reply.started":"2025-08-24T10:55:57.464718Z","shell.execute_reply":"2025-08-24T10:55:58.787981Z"}},"outputs":[],"execution_count":6},{"cell_type":"code","source":"#pre-processing function\ndef remove_non_amharic(text):\n    \"\"\"Remove digits, punctuation and non-Amharic characters from a comment.\n\n    The clean-up passes run one after another, each on the previous result:\n      1. tokens containing ASCII digits, including Amharic letters attached to them;\n      2. Amharic punctuation marks (፡ through ፨);\n      3. any other character that is neither a word character nor whitespace;\n      4. everything outside the Ethiopic block U+1200-U+137F except whitespace.\n    Whitespace runs left behind by the removals are then collapsed.\n\n    Args:\n        text: comment to clean (str).\n\n    Returns:\n        The cleaned comment, with words separated by single spaces.\n    \"\"\"\n    patterns = (\n        r'[\\u1200-\\u137F]*[0-9]+[\\u1200-\\u137F]*',  # tokens containing digits\n        r'[፡-፨]',  # Amharic punctuation marks\n        r'[^\\w\\s]',  # remaining punctuation and symbols\n        r'[^\\u1200-\\u137F\\s]',  # anything that is not Amharic or whitespace\n    )\n    cleaned_text = text\n    for pattern in patterns:\n        cleaned_text = re.sub(pattern, '', cleaned_text)\n    # the removals leave gaps behind; squeeze them into single spaces\n    return re.sub(r'\\s+', ' ', cleaned_text).strip()","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-24T10:55:58.789529Z","iopub.execute_input":"2025-08-24T10:55:58.789781Z","iopub.status.idle":"2025-08-24T10:55:58.793993Z","shell.execute_reply.started":"2025-08-24T10:55:58.789760Z","shell.execute_reply":"2025-08-24T10:55:58.793062Z"}},"outputs":[],"execution_count":7},{"cell_type":"code","source":"# processing the data such as removing non-Amharic characters, digit,punctuation marks\ndata['comments']=data['comments'].apply(remove_non_amharic)\ndata.tail(20)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-24T10:55:58.796631Z","iopub.execute_input":"2025-08-24T10:55:58.796837Z","iopub.status.idle":"2025-08-24T10:55:58.905992Z","shell.execute_reply.started":"2025-08-24T10:55:58.796819Z","shell.execute_reply":"2025-08-24T10:55:58.905145Z"}},"outputs":[{"execution_count":8,"output_type":"execute_result","data":{"text/plain":" comments label word_len\n34692 አንተ ጎበዝ ነህ 1 3.0\n34693 ከትምህርት እየተመለስኩ ነበር እና አንድ ልጅ በጣም በሆነ መንገድ ለመን... 
1 12.0\n34694 ከሉሲ ጀርባ ያለው የሀያላኑ ሴራ ለምን በአፋር ተገኘች እውነታው ተጋለጠ vi 0 12.0\n34695 ጠቅላይ ሚኒስትር ዶክተር አብይ አህመድ በሰላም መቶ በመቶ ያሸልማል መርሀ... 0 13.0\n34696 ዘመዴ ወደው ነው የሚዘግቡት በኛ በጀት እየተቀሳቁሱ የኦርቶዶክስ ድምፅ ማ... 0 11.0\n34697 RIP አንድም ቀን ዝቅ ያላሉ በተናገሩ ቁጥር ልቤን ከፍቼ የማዳምጣቸው የ... 0 17.0\n34698 በወይኑ እና ቢራ ሱቅ አቅራቢያ በሰከሩ ሰዎች መጎርጎር እና አስተያየት መ... 1 12.0\n34699 በመንገድ ዳር ስሄድ ይህ ሰው ፍንዳታዬን መንካት ፈልጎ ነበር ነገር ግን ... 1 24.0\n34700 Vipi WKeny የባይደን አስተዳደር ከዚህ ቀደም ከነበሩት አስተዳደሮች ... 0 22.0\n34701 አንድ ልጅ በጣም ወጣት ስለነበር የግል ጓደኞቹን ብልጭ ድርግም ይለኛል 1 10.0\n34702 ሞሪስ በጣም ግዙፍ wanker 1 4.0\n34703 በጣም ምርጥ ንግግር ነው ወቅታዊ የኢትዮጵያን ችግር አስረድተዋል ምናለ ለ... 0 13.0\n34704 ደግም ፓስታ ለመስራት ውሀ ምን ያደርጋል እንደገና ልታስጀምረኝ ነዉ እንዴ 0 10.0\n34705 በጦርነት ቋንቋ በጁንታው ላይ የተደረገ ያለው የህግ ማስከበር እርምጃ ቱ ... 0 11.0\n34706 አንተ ደግሞ ምን አይነት ደደብ ነህ እኔኮ አስተዋይ ነበር የምትመስለኝ ሁ... 1 27.0\n34707 ከእለታት አንድ ቀን አንዲት ልጅ ወደ ቡዳ እየመጣች ነበር እና ብዙ ወንዶ... 1 19.0\n34708 የሴቶች ትልቁ ችግር ያወራቸው ያያቸው ሁሉ ሚወዳቸው ይመስላቸዋል 1 8.0\n34709 ላይ የኢትዮጵያ ሏላዊ ስልጣን ባለቤት የኢትዮጵያ ህዝብ ነው አንድ የኢትዮ... 0 28.0\n34710 አንዲት ልጅ ከትምህርት ቤት እየመጣች ሳለ አንድ ባለሱቅ ደውላ ወደ ሱቅ ... 1 16.0\n34711 ምሽት ላይ በአውቶቡስ ማቆሚያ ላይ ቆሜ ነበር አንዳንድ ወንዶች ልጆች የስ... 1 16.0","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
commentslabelword_len
34692አንተ ጎበዝ ነህ13.0
34693ከትምህርት እየተመለስኩ ነበር እና አንድ ልጅ በጣም በሆነ መንገድ ለመን...112.0
34694ከሉሲ ጀርባ ያለው የሀያላኑ ሴራ ለምን በአፋር ተገኘች እውነታው ተጋለጠ vi012.0
34695ጠቅላይ ሚኒስትር ዶክተር አብይ አህመድ በሰላም መቶ በመቶ ያሸልማል መርሀ...013.0
34696ዘመዴ ወደው ነው የሚዘግቡት በኛ በጀት እየተቀሳቁሱ የኦርቶዶክስ ድምፅ ማ...011.0
34697RIP አንድም ቀን ዝቅ ያላሉ በተናገሩ ቁጥር ልቤን ከፍቼ የማዳምጣቸው የ...017.0
34698በወይኑ እና ቢራ ሱቅ አቅራቢያ በሰከሩ ሰዎች መጎርጎር እና አስተያየት መ...112.0
34699በመንገድ ዳር ስሄድ ይህ ሰው ፍንዳታዬን መንካት ፈልጎ ነበር ነገር ግን ...124.0
34700Vipi WKeny የባይደን አስተዳደር ከዚህ ቀደም ከነበሩት አስተዳደሮች ...022.0
34701አንድ ልጅ በጣም ወጣት ስለነበር የግል ጓደኞቹን ብልጭ ድርግም ይለኛል110.0
34702ሞሪስ በጣም ግዙፍ wanker14.0
34703በጣም ምርጥ ንግግር ነው ወቅታዊ የኢትዮጵያን ችግር አስረድተዋል ምናለ ለ...013.0
34704ደግም ፓስታ ለመስራት ውሀ ምን ያደርጋል እንደገና ልታስጀምረኝ ነዉ እንዴ010.0
34705በጦርነት ቋንቋ በጁንታው ላይ የተደረገ ያለው የህግ ማስከበር እርምጃ ቱ ...011.0
34706አንተ ደግሞ ምን አይነት ደደብ ነህ እኔኮ አስተዋይ ነበር የምትመስለኝ ሁ...127.0
34707ከእለታት አንድ ቀን አንዲት ልጅ ወደ ቡዳ እየመጣች ነበር እና ብዙ ወንዶ...119.0
34708የሴቶች ትልቁ ችግር ያወራቸው ያያቸው ሁሉ ሚወዳቸው ይመስላቸዋል18.0
34709ላይ የኢትዮጵያ ሏላዊ ስልጣን ባለቤት የኢትዮጵያ ህዝብ ነው አንድ የኢትዮ...028.0
34710አንዲት ልጅ ከትምህርት ቤት እየመጣች ሳለ አንድ ባለሱቅ ደውላ ወደ ሱቅ ...116.0
34711ምሽት ላይ በአውቶቡስ ማቆሚያ ላይ ቆሜ ነበር አንዳንድ ወንዶች ልጆች የስ...116.0
\n
"},"metadata":{}}],"execution_count":8},{"cell_type":"code","source":"!pip install lime","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-24T10:55:58.907208Z","iopub.execute_input":"2025-08-24T10:55:58.907405Z","iopub.status.idle":"2025-08-24T10:56:02.288523Z","shell.execute_reply.started":"2025-08-24T10:55:58.907388Z","shell.execute_reply":"2025-08-24T10:56:02.287677Z"}},"outputs":[{"name":"stdout","text":"Requirement already satisfied: lime in /usr/local/lib/python3.10/dist-packages (0.2.0.1)\nRequirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from lime) (3.7.5)\nRequirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from lime) (1.26.4)\nRequirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from lime) (1.13.1)\nRequirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from lime) (4.67.1)\nRequirement already satisfied: scikit-learn>=0.18 in /usr/local/lib/python3.10/dist-packages (from lime) (1.2.2)\nRequirement already satisfied: scikit-image>=0.12 in /usr/local/lib/python3.10/dist-packages (from lime) (0.25.0)\nRequirement already satisfied: networkx>=3.0 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.12->lime) (3.4.2)\nRequirement already satisfied: pillow>=10.1 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.12->lime) (11.0.0)\nRequirement already satisfied: imageio!=2.35.0,>=2.33 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.12->lime) (2.36.1)\nRequirement already satisfied: tifffile>=2022.8.12 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.12->lime) (2024.12.12)\nRequirement already satisfied: packaging>=21 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.12->lime) (24.2)\nRequirement already satisfied: lazy-loader>=0.4 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.12->lime) (0.4)\nRequirement already satisfied: mkl_fft in 
/usr/local/lib/python3.10/dist-packages (from numpy->lime) (1.3.8)\nRequirement already satisfied: mkl_random in /usr/local/lib/python3.10/dist-packages (from numpy->lime) (1.2.4)\nRequirement already satisfied: mkl_umath in /usr/local/lib/python3.10/dist-packages (from numpy->lime) (0.1.1)\nRequirement already satisfied: mkl in /usr/local/lib/python3.10/dist-packages (from numpy->lime) (2025.0.1)\nRequirement already satisfied: tbb4py in /usr/local/lib/python3.10/dist-packages (from numpy->lime) (2022.0.0)\nRequirement already satisfied: mkl-service in /usr/local/lib/python3.10/dist-packages (from numpy->lime) (2.4.1)\nRequirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.18->lime) (1.4.2)\nRequirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.18->lime) (3.5.0)\nRequirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->lime) (1.3.1)\nRequirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->lime) (0.12.1)\nRequirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->lime) (4.55.3)\nRequirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->lime) (1.4.7)\nRequirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->lime) (3.2.0)\nRequirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->lime) (2.9.0.post0)\nRequirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib->lime) (1.17.0)\nRequirement already satisfied: intel-openmp>=2024 in /usr/local/lib/python3.10/dist-packages (from mkl->numpy->lime) (2024.2.0)\nRequirement already satisfied: tbb==2022.* in 
/usr/local/lib/python3.10/dist-packages (from mkl->numpy->lime) (2022.0.0)\nRequirement already satisfied: tcmlib==1.* in /usr/local/lib/python3.10/dist-packages (from tbb==2022.*->mkl->numpy->lime) (1.2.0)\nRequirement already satisfied: intel-cmplr-lib-rt in /usr/local/lib/python3.10/dist-packages (from mkl_umath->numpy->lime) (2024.2.0)\nRequirement already satisfied: intel-cmplr-lib-ur==2024.2.0 in /usr/local/lib/python3.10/dist-packages (from intel-openmp>=2024->mkl->numpy->lime) (2024.2.0)\n","output_type":"stream"}],"execution_count":9},{"cell_type":"code","source":"#importing important packages \nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.pipeline import make_pipeline\nfrom lime.lime_text import LimeTextExplainer","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-24T10:56:02.289618Z","iopub.execute_input":"2025-08-24T10:56:02.289861Z","iopub.status.idle":"2025-08-24T10:56:02.780110Z","shell.execute_reply.started":"2025-08-24T10:56:02.289840Z","shell.execute_reply":"2025-08-24T10:56:02.779407Z"}},"outputs":[],"execution_count":10},{"cell_type":"code","source":"#spliting the dataset into training and testing set\ntrain_val_df, test_dataset = train_test_split(data, test_size=0.20, random_state=42)\ntrain_dataset, evaluation_dataset = train_test_split(train_val_df, test_size=0.10, random_state=42)\nprint('Training dataset shape: ', train_dataset.shape)\nprint('Validation dataset shape: ', evaluation_dataset.shape)\nprint('Testing dataset shape: ', 
test_dataset.shape)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-24T10:56:02.780965Z","iopub.execute_input":"2025-08-24T10:56:02.781310Z","iopub.status.idle":"2025-08-24T10:56:02.798062Z","shell.execute_reply.started":"2025-08-24T10:56:02.781289Z","shell.execute_reply":"2025-08-24T10:56:02.797295Z"}},"outputs":[{"name":"stdout","text":"Training dataset shape: (24991, 3)\nValidation dataset shape: (2777, 3)\nTesting dataset shape: (6942, 3)\n","output_type":"stream"}],"execution_count":11},{"cell_type":"code","source":"# The train/validation/test frames come from the seeded train_test_split calls above.\n# Do not re-draw train/test from a random mask over `data` here: evaluation_dataset\n# would then overlap both of them and leak validation rows into training.\nassert len(train_dataset) + len(evaluation_dataset) + len(test_dataset) == len(data)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-24T10:56:02.798944Z","iopub.execute_input":"2025-08-24T10:56:02.799147Z","iopub.status.idle":"2025-08-24T10:56:02.805986Z","shell.execute_reply.started":"2025-08-24T10:56:02.799131Z","shell.execute_reply":"2025-08-24T10:56:02.805346Z"}},"outputs":[],"execution_count":12},{"cell_type":"code","source":" pip install datasets","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-24T10:56:02.806751Z","iopub.execute_input":"2025-08-24T10:56:02.807006Z","iopub.status.idle":"2025-08-24T10:56:06.321605Z","shell.execute_reply.started":"2025-08-24T10:56:02.806976Z","shell.execute_reply":"2025-08-24T10:56:06.320517Z"}},"outputs":[{"name":"stdout","text":"Requirement already satisfied: datasets in /usr/local/lib/python3.10/dist-packages (3.3.1)\nRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from datasets) (3.17.0)\nRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (1.26.4)\nRequirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (19.0.1)\nRequirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.3.8)\nRequirement already satisfied: pandas in 
/usr/local/lib/python3.10/dist-packages (from datasets) (2.2.3)\nRequirement already satisfied: requests>=2.32.2 in /usr/local/lib/python3.10/dist-packages (from datasets) (2.32.3)\nRequirement already satisfied: tqdm>=4.66.3 in /usr/local/lib/python3.10/dist-packages (from datasets) (4.67.1)\nRequirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets) (3.5.0)\nRequirement already satisfied: multiprocess<0.70.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.70.16)\nRequirement already satisfied: fsspec<=2024.12.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets) (2024.12.0)\nRequirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.11.12)\nRequirement already satisfied: huggingface-hub>=0.24.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.34.4)\nRequirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from datasets) (24.2)\nRequirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (6.0.2)\nRequirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (2.4.6)\nRequirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.2)\nRequirement already satisfied: async-timeout<6.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (5.0.1)\nRequirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (25.1.0)\nRequirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.5.0)\nRequirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.1.0)\nRequirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.10/dist-packages 
(from aiohttp->datasets) (0.2.1)\nRequirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.18.3)\nRequirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.24.0->datasets) (4.12.2)\nRequirement already satisfied: hf-xet<2.0.0,>=1.1.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.24.0->datasets) (1.1.8)\nRequirement already satisfied: mkl_fft in /usr/local/lib/python3.10/dist-packages (from numpy>=1.17->datasets) (1.3.8)\nRequirement already satisfied: mkl_random in /usr/local/lib/python3.10/dist-packages (from numpy>=1.17->datasets) (1.2.4)\nRequirement already satisfied: mkl_umath in /usr/local/lib/python3.10/dist-packages (from numpy>=1.17->datasets) (0.1.1)\nRequirement already satisfied: mkl in /usr/local/lib/python3.10/dist-packages (from numpy>=1.17->datasets) (2025.0.1)\nRequirement already satisfied: tbb4py in /usr/local/lib/python3.10/dist-packages (from numpy>=1.17->datasets) (2022.0.0)\nRequirement already satisfied: mkl-service in /usr/local/lib/python3.10/dist-packages (from numpy>=1.17->datasets) (2.4.1)\nRequirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (3.4.1)\nRequirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (3.10)\nRequirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (2.3.0)\nRequirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (2025.1.31)\nRequirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.9.0.post0)\nRequirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) 
(2025.1)\nRequirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2025.1)\nRequirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.17.0)\nRequirement already satisfied: intel-openmp>=2024 in /usr/local/lib/python3.10/dist-packages (from mkl->numpy>=1.17->datasets) (2024.2.0)\nRequirement already satisfied: tbb==2022.* in /usr/local/lib/python3.10/dist-packages (from mkl->numpy>=1.17->datasets) (2022.0.0)\nRequirement already satisfied: tcmlib==1.* in /usr/local/lib/python3.10/dist-packages (from tbb==2022.*->mkl->numpy>=1.17->datasets) (1.2.0)\nRequirement already satisfied: intel-cmplr-lib-rt in /usr/local/lib/python3.10/dist-packages (from mkl_umath->numpy>=1.17->datasets) (2024.2.0)\nRequirement already satisfied: intel-cmplr-lib-ur==2024.2.0 in /usr/local/lib/python3.10/dist-packages (from intel-openmp>=2024->mkl->numpy>=1.17->datasets) (2024.2.0)\nNote: you may need to restart the kernel to use updated packages.\n","output_type":"stream"}],"execution_count":13},{"cell_type":"code","source":"from datasets import Dataset\n#convert format of the dataset to HuggingFace Dataset from Pandas DataFrame\ntrain_dataset=Dataset.from_pandas(train_dataset)\ntest_dataset=Dataset.from_pandas(test_dataset)\nevaluation_dataset=Dataset.from_pandas(evaluation_dataset)\nprint(test_dataset)\nprint(train_dataset)\nprint(evaluation_dataset)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-24T10:56:06.322822Z","iopub.execute_input":"2025-08-24T10:56:06.323209Z","iopub.status.idle":"2025-08-24T10:56:07.011981Z","shell.execute_reply.started":"2025-08-24T10:56:06.323167Z","shell.execute_reply":"2025-08-24T10:56:07.011191Z"}},"outputs":[{"name":"stdout","text":"Dataset({\n features: ['comments', 'label', 'word_len', '__index_level_0__'],\n num_rows: 7017\n})\nDataset({\n features: ['comments', 'label', 'word_len', 
'__index_level_0__'],\n num_rows: 27693\n})\nDataset({\n features: ['comments', 'label', 'word_len', '__index_level_0__'],\n num_rows: 2777\n})\n","output_type":"stream"}],"execution_count":14},{"cell_type":"code","source":"#remove unnecessary column\ntest_dataset=test_dataset.remove_columns(\"__index_level_0__\")\ntrain_dataset=train_dataset.remove_columns(\"__index_level_0__\")\nevaluation_dataset=evaluation_dataset.remove_columns(\"__index_level_0__\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-24T10:56:07.012843Z","iopub.execute_input":"2025-08-24T10:56:07.013374Z","iopub.status.idle":"2025-08-24T10:56:07.020167Z","shell.execute_reply.started":"2025-08-24T10:56:07.013340Z","shell.execute_reply":"2025-08-24T10:56:07.019299Z"}},"outputs":[],"execution_count":15},{"cell_type":"code","source":"#combine the train, test and evaluation datasets into one DatasetDict\nimport datasets\nmain_dataset= datasets.DatasetDict({\n 'train': train_dataset,\n 'test': test_dataset,\n 'evaluate': evaluation_dataset\n})","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-24T10:56:07.021038Z","iopub.execute_input":"2025-08-24T10:56:07.021237Z","iopub.status.idle":"2025-08-24T10:56:07.041293Z","shell.execute_reply.started":"2025-08-24T10:56:07.021219Z","shell.execute_reply":"2025-08-24T10:56:07.040454Z"}},"outputs":[],"execution_count":16},{"cell_type":"code","source":"# training, testing and evaluation data size\ntraining_data_size = main_dataset['train'].num_rows\ntesting_data_size = main_dataset['test'].num_rows\nevaluation_data_size = main_dataset['evaluate'].num_rows","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-24T10:56:07.042193Z","iopub.execute_input":"2025-08-24T10:56:07.042406Z","iopub.status.idle":"2025-08-24T10:56:07.059518Z","shell.execute_reply.started":"2025-08-24T10:56:07.042388Z","shell.execute_reply":"2025-08-24T10:56:07.058694Z"}},"outputs":[],"execution_count":17},{"cell_type":"code","source":"from huggingface_hub import 
snapshot_download\nsnapshot_download(repo_id=\"devaprobs/hate-speech-detection-using-amharic-language\")\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-24T10:56:07.060270Z","iopub.execute_input":"2025-08-24T10:56:07.060516Z","iopub.status.idle":"2025-08-24T10:56:07.212934Z","shell.execute_reply.started":"2025-08-24T10:56:07.060487Z","shell.execute_reply":"2025-08-24T10:56:07.212072Z"}},"outputs":[{"output_type":"display_data","data":{"text/plain":"Fetching 8 files: 0%| | 0/8 [00:00=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.26.4)\nRequirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (24.2)\nRequirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.2)\nRequirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2024.11.6)\nRequirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.32.3)\nRequirement already satisfied: tokenizers<0.22,>=0.21 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.21.0)\nRequirement already satisfied: safetensors>=0.4.3 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.5)\nRequirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.67.1)\nRequirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub) (2024.12.0)\nRequirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub) (4.12.2)\nRequirement already satisfied: hf-xet<2.0.0,>=1.1.3 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub) (1.1.8)\nRequirement already satisfied: mkl_fft in /usr/local/lib/python3.10/dist-packages (from numpy>=1.17->transformers) (1.3.8)\nRequirement already satisfied: mkl_random in 
/usr/local/lib/python3.10/dist-packages (from numpy>=1.17->transformers) (1.2.4)\nRequirement already satisfied: mkl_umath in /usr/local/lib/python3.10/dist-packages (from numpy>=1.17->transformers) (0.1.1)\nRequirement already satisfied: mkl in /usr/local/lib/python3.10/dist-packages (from numpy>=1.17->transformers) (2025.0.1)\nRequirement already satisfied: tbb4py in /usr/local/lib/python3.10/dist-packages (from numpy>=1.17->transformers) (2022.0.0)\nRequirement already satisfied: mkl-service in /usr/local/lib/python3.10/dist-packages (from numpy>=1.17->transformers) (2.4.1)\nRequirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4.1)\nRequirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.10)\nRequirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.3.0)\nRequirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2025.1.31)\nRequirement already satisfied: intel-openmp>=2024 in /usr/local/lib/python3.10/dist-packages (from mkl->numpy>=1.17->transformers) (2024.2.0)\nRequirement already satisfied: tbb==2022.* in /usr/local/lib/python3.10/dist-packages (from mkl->numpy>=1.17->transformers) (2022.0.0)\nRequirement already satisfied: tcmlib==1.* in /usr/local/lib/python3.10/dist-packages (from tbb==2022.*->mkl->numpy>=1.17->transformers) (1.2.0)\nRequirement already satisfied: intel-cmplr-lib-rt in /usr/local/lib/python3.10/dist-packages (from mkl_umath->numpy>=1.17->transformers) (2024.2.0)\nRequirement already satisfied: intel-cmplr-lib-ur==2024.2.0 in /usr/local/lib/python3.10/dist-packages (from intel-openmp>=2024->mkl->numpy>=1.17->transformers) (2024.2.0)\nNote: you may need to restart the kernel to use updated 
packages.\n","output_type":"stream"}],"execution_count":28},{"cell_type":"code","source":"pip install --upgrade accelerate\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-24T10:56:39.177537Z","iopub.execute_input":"2025-08-24T10:56:39.177881Z","iopub.status.idle":"2025-08-24T10:56:42.698598Z","shell.execute_reply.started":"2025-08-24T10:56:39.177845Z","shell.execute_reply":"2025-08-24T10:56:42.697549Z"}},"outputs":[{"name":"stdout","text":"Requirement already satisfied: accelerate in /usr/local/lib/python3.10/dist-packages (1.10.0)\nRequirement already satisfied: numpy<3.0.0,>=1.17 in /usr/local/lib/python3.10/dist-packages (from accelerate) (1.26.4)\nRequirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from accelerate) (24.2)\nRequirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate) (5.9.5)\nRequirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from accelerate) (6.0.2)\nRequirement already satisfied: torch>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from accelerate) (2.5.1+cu121)\nRequirement already satisfied: huggingface_hub>=0.21.0 in /usr/local/lib/python3.10/dist-packages (from accelerate) (0.34.4)\nRequirement already satisfied: safetensors>=0.4.3 in /usr/local/lib/python3.10/dist-packages (from accelerate) (0.4.5)\nRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from huggingface_hub>=0.21.0->accelerate) (3.17.0)\nRequirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub>=0.21.0->accelerate) (2024.12.0)\nRequirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from huggingface_hub>=0.21.0->accelerate) (2.32.3)\nRequirement already satisfied: tqdm>=4.42.1 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub>=0.21.0->accelerate) (4.67.1)\nRequirement already satisfied: typing-extensions>=3.7.4.3 
in /usr/local/lib/python3.10/dist-packages (from huggingface_hub>=0.21.0->accelerate) (4.12.2)\nRequirement already satisfied: hf-xet<2.0.0,>=1.1.3 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub>=0.21.0->accelerate) (1.1.8)\nRequirement already satisfied: mkl_fft in /usr/local/lib/python3.10/dist-packages (from numpy<3.0.0,>=1.17->accelerate) (1.3.8)\nRequirement already satisfied: mkl_random in /usr/local/lib/python3.10/dist-packages (from numpy<3.0.0,>=1.17->accelerate) (1.2.4)\nRequirement already satisfied: mkl_umath in /usr/local/lib/python3.10/dist-packages (from numpy<3.0.0,>=1.17->accelerate) (0.1.1)\nRequirement already satisfied: mkl in /usr/local/lib/python3.10/dist-packages (from numpy<3.0.0,>=1.17->accelerate) (2025.0.1)\nRequirement already satisfied: tbb4py in /usr/local/lib/python3.10/dist-packages (from numpy<3.0.0,>=1.17->accelerate) (2022.0.0)\nRequirement already satisfied: mkl-service in /usr/local/lib/python3.10/dist-packages (from numpy<3.0.0,>=1.17->accelerate) (2.4.1)\nRequirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=2.0.0->accelerate) (3.4.2)\nRequirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=2.0.0->accelerate) (3.1.4)\nRequirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.10/dist-packages (from torch>=2.0.0->accelerate) (1.13.1)\nRequirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy==1.13.1->torch>=2.0.0->accelerate) (1.3.0)\nRequirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=2.0.0->accelerate) (3.0.2)\nRequirement already satisfied: intel-openmp>=2024 in /usr/local/lib/python3.10/dist-packages (from mkl->numpy<3.0.0,>=1.17->accelerate) (2024.2.0)\nRequirement already satisfied: tbb==2022.* in /usr/local/lib/python3.10/dist-packages (from mkl->numpy<3.0.0,>=1.17->accelerate) (2022.0.0)\nRequirement already 
satisfied: tcmlib==1.* in /usr/local/lib/python3.10/dist-packages (from tbb==2022.*->mkl->numpy<3.0.0,>=1.17->accelerate) (1.2.0)\nRequirement already satisfied: intel-cmplr-lib-rt in /usr/local/lib/python3.10/dist-packages (from mkl_umath->numpy<3.0.0,>=1.17->accelerate) (2024.2.0)\nRequirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface_hub>=0.21.0->accelerate) (3.4.1)\nRequirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface_hub>=0.21.0->accelerate) (3.10)\nRequirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface_hub>=0.21.0->accelerate) (2.3.0)\nRequirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface_hub>=0.21.0->accelerate) (2025.1.31)\nRequirement already satisfied: intel-cmplr-lib-ur==2024.2.0 in /usr/local/lib/python3.10/dist-packages (from intel-openmp>=2024->mkl->numpy<3.0.0,>=1.17->accelerate) (2024.2.0)\nNote: you may need to restart the kernel to use updated packages.\n","output_type":"stream"}],"execution_count":29},{"cell_type":"code","source":"#Load auto mode classifier from the pretrained model\nfrom transformers import AutoModelForSequenceClassification\nmodel = AutoModelForSequenceClassification.from_pretrained(\"devaprobs/hate-speech-detection-using-amharic-language\", num_labels=2)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-24T10:56:42.699665Z","iopub.execute_input":"2025-08-24T10:56:42.699913Z","iopub.status.idle":"2025-08-24T10:56:42.856541Z","shell.execute_reply.started":"2025-08-24T10:56:42.699892Z","shell.execute_reply":"2025-08-24T10:56:42.855893Z"}},"outputs":[],"execution_count":30},{"cell_type":"code","source":"!pip install 
evaluate","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-24T10:56:42.857466Z","iopub.execute_input":"2025-08-24T10:56:42.857686Z","iopub.status.idle":"2025-08-24T10:56:46.330522Z","shell.execute_reply.started":"2025-08-24T10:56:42.857668Z","shell.execute_reply":"2025-08-24T10:56:46.329390Z"}},"outputs":[{"name":"stdout","text":"Requirement already satisfied: evaluate in /usr/local/lib/python3.10/dist-packages (0.4.5)\nRequirement already satisfied: datasets>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from evaluate) (3.3.1)\nRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from evaluate) (1.26.4)\nRequirement already satisfied: dill in /usr/local/lib/python3.10/dist-packages (from evaluate) (0.3.8)\nRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from evaluate) (2.2.3)\nRequirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from evaluate) (2.32.3)\nRequirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.10/dist-packages (from evaluate) (4.67.1)\nRequirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from evaluate) (3.5.0)\nRequirement already satisfied: multiprocess in /usr/local/lib/python3.10/dist-packages (from evaluate) (0.70.16)\nRequirement already satisfied: fsspec>=2021.05.0 in /usr/local/lib/python3.10/dist-packages (from fsspec[http]>=2021.05.0->evaluate) (2024.12.0)\nRequirement already satisfied: huggingface-hub>=0.7.0 in /usr/local/lib/python3.10/dist-packages (from evaluate) (0.34.4)\nRequirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from evaluate) (24.2)\nRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->evaluate) (3.17.0)\nRequirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->evaluate) (19.0.1)\nRequirement already satisfied: 
aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->evaluate) (3.11.12)\nRequirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->evaluate) (6.0.2)\nRequirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.7.0->evaluate) (4.12.2)\nRequirement already satisfied: hf-xet<2.0.0,>=1.1.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.7.0->evaluate) (1.1.8)\nRequirement already satisfied: mkl_fft in /usr/local/lib/python3.10/dist-packages (from numpy>=1.17->evaluate) (1.3.8)\nRequirement already satisfied: mkl_random in /usr/local/lib/python3.10/dist-packages (from numpy>=1.17->evaluate) (1.2.4)\nRequirement already satisfied: mkl_umath in /usr/local/lib/python3.10/dist-packages (from numpy>=1.17->evaluate) (0.1.1)\nRequirement already satisfied: mkl in /usr/local/lib/python3.10/dist-packages (from numpy>=1.17->evaluate) (2025.0.1)\nRequirement already satisfied: tbb4py in /usr/local/lib/python3.10/dist-packages (from numpy>=1.17->evaluate) (2022.0.0)\nRequirement already satisfied: mkl-service in /usr/local/lib/python3.10/dist-packages (from numpy>=1.17->evaluate) (2.4.1)\nRequirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->evaluate) (3.4.1)\nRequirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->evaluate) (3.10)\nRequirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->evaluate) (2.3.0)\nRequirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->evaluate) (2025.1.31)\nRequirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->evaluate) (2.9.0.post0)\nRequirement already satisfied: pytz>=2020.1 in 
/usr/local/lib/python3.10/dist-packages (from pandas->evaluate) (2025.1)\nRequirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas->evaluate) (2025.1)\nRequirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->evaluate) (2.4.6)\nRequirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->evaluate) (1.3.2)\nRequirement already satisfied: async-timeout<6.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->evaluate) (5.0.1)\nRequirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->evaluate) (25.1.0)\nRequirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->evaluate) (1.5.0)\nRequirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->evaluate) (6.1.0)\nRequirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->evaluate) (0.2.1)\nRequirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->evaluate) (1.18.3)\nRequirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->evaluate) (1.17.0)\nRequirement already satisfied: intel-openmp>=2024 in /usr/local/lib/python3.10/dist-packages (from mkl->numpy>=1.17->evaluate) (2024.2.0)\nRequirement already satisfied: tbb==2022.* in /usr/local/lib/python3.10/dist-packages (from mkl->numpy>=1.17->evaluate) (2022.0.0)\nRequirement already satisfied: tcmlib==1.* in /usr/local/lib/python3.10/dist-packages (from tbb==2022.*->mkl->numpy>=1.17->evaluate) (1.2.0)\nRequirement already satisfied: intel-cmplr-lib-rt in /usr/local/lib/python3.10/dist-packages 
(from mkl_umath->numpy>=1.17->evaluate) (2024.2.0)\nRequirement already satisfied: intel-cmplr-lib-ur==2024.2.0 in /usr/local/lib/python3.10/dist-packages (from intel-openmp>=2024->mkl->numpy>=1.17->evaluate) (2024.2.0)\n","output_type":"stream"}],"execution_count":31},{"cell_type":"code","source":"!pip install tensorflow\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-24T10:56:46.331738Z","iopub.execute_input":"2025-08-24T10:56:46.332030Z","iopub.status.idle":"2025-08-24T10:56:49.772001Z","shell.execute_reply.started":"2025-08-24T10:56:46.332007Z","shell.execute_reply":"2025-08-24T10:56:49.771100Z"}},"outputs":[{"name":"stdout","text":"Requirement already satisfied: tensorflow in /usr/local/lib/python3.10/dist-packages (2.17.1)\nRequirement already satisfied: absl-py>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (1.4.0)\nRequirement already satisfied: astunparse>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (1.6.3)\nRequirement already satisfied: flatbuffers>=24.3.25 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (24.3.25)\nRequirement already satisfied: gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (0.6.0)\nRequirement already satisfied: google-pasta>=0.1.1 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (0.2.0)\nRequirement already satisfied: h5py>=3.10.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (3.12.1)\nRequirement already satisfied: libclang>=13.0.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (18.1.1)\nRequirement already satisfied: ml-dtypes<0.5.0,>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (0.4.1)\nRequirement already satisfied: opt-einsum>=2.3.2 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (3.4.0)\nRequirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from tensorflow) (24.2)\nRequirement already satisfied: 
protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (3.20.3)\nRequirement already satisfied: requests<3,>=2.21.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (2.32.3)\nRequirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from tensorflow) (75.1.0)\nRequirement already satisfied: six>=1.12.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (1.17.0)\nRequirement already satisfied: termcolor>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (2.5.0)\nRequirement already satisfied: typing-extensions>=3.6.6 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (4.12.2)\nRequirement already satisfied: wrapt>=1.11.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (1.17.0)\nRequirement already satisfied: grpcio<2.0,>=1.24.3 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (1.68.1)\nRequirement already satisfied: tensorboard<2.18,>=2.17 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (2.17.1)\nRequirement already satisfied: keras>=3.2.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (3.5.0)\nRequirement already satisfied: tensorflow-io-gcs-filesystem>=0.23.1 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (0.37.1)\nRequirement already satisfied: numpy<2.0.0,>=1.23.5 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (1.26.4)\nRequirement already satisfied: wheel<1.0,>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from astunparse>=1.6.0->tensorflow) (0.45.1)\nRequirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from keras>=3.2.0->tensorflow) (13.9.4)\nRequirement already satisfied: namex in /usr/local/lib/python3.10/dist-packages (from keras>=3.2.0->tensorflow) (0.0.8)\nRequirement already satisfied: optree in /usr/local/lib/python3.10/dist-packages (from keras>=3.2.0->tensorflow) 
(0.13.1)\nRequirement already satisfied: mkl_fft in /usr/local/lib/python3.10/dist-packages (from numpy<2.0.0,>=1.23.5->tensorflow) (1.3.8)\nRequirement already satisfied: mkl_random in /usr/local/lib/python3.10/dist-packages (from numpy<2.0.0,>=1.23.5->tensorflow) (1.2.4)\nRequirement already satisfied: mkl_umath in /usr/local/lib/python3.10/dist-packages (from numpy<2.0.0,>=1.23.5->tensorflow) (0.1.1)\nRequirement already satisfied: mkl in /usr/local/lib/python3.10/dist-packages (from numpy<2.0.0,>=1.23.5->tensorflow) (2025.0.1)\nRequirement already satisfied: tbb4py in /usr/local/lib/python3.10/dist-packages (from numpy<2.0.0,>=1.23.5->tensorflow) (2022.0.0)\nRequirement already satisfied: mkl-service in /usr/local/lib/python3.10/dist-packages (from numpy<2.0.0,>=1.23.5->tensorflow) (2.4.1)\nRequirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.21.0->tensorflow) (3.4.1)\nRequirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.21.0->tensorflow) (3.10)\nRequirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.21.0->tensorflow) (2.3.0)\nRequirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.21.0->tensorflow) (2025.1.31)\nRequirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.10/dist-packages (from tensorboard<2.18,>=2.17->tensorflow) (3.7)\nRequirement already satisfied: tensorboard-data-server<0.8.0,>=0.7.0 in /usr/local/lib/python3.10/dist-packages (from tensorboard<2.18,>=2.17->tensorflow) (0.7.2)\nRequirement already satisfied: werkzeug>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from tensorboard<2.18,>=2.17->tensorflow) (3.1.3)\nRequirement already satisfied: MarkupSafe>=2.1.1 in /usr/local/lib/python3.10/dist-packages (from werkzeug>=1.0.1->tensorboard<2.18,>=2.17->tensorflow) (3.0.2)\nRequirement already 
satisfied: intel-openmp>=2024 in /usr/local/lib/python3.10/dist-packages (from mkl->numpy<2.0.0,>=1.23.5->tensorflow) (2024.2.0)\nRequirement already satisfied: tbb==2022.* in /usr/local/lib/python3.10/dist-packages (from mkl->numpy<2.0.0,>=1.23.5->tensorflow) (2022.0.0)\nRequirement already satisfied: tcmlib==1.* in /usr/local/lib/python3.10/dist-packages (from tbb==2022.*->mkl->numpy<2.0.0,>=1.23.5->tensorflow) (1.2.0)\nRequirement already satisfied: intel-cmplr-lib-rt in /usr/local/lib/python3.10/dist-packages (from mkl_umath->numpy<2.0.0,>=1.23.5->tensorflow) (2024.2.0)\nRequirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich->keras>=3.2.0->tensorflow) (3.0.0)\nRequirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich->keras>=3.2.0->tensorflow) (2.19.1)\nRequirement already satisfied: intel-cmplr-lib-ur==2024.2.0 in /usr/local/lib/python3.10/dist-packages (from intel-openmp>=2024->mkl->numpy<2.0.0,>=1.23.5->tensorflow) (2024.2.0)\nRequirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py>=2.2.0->rich->keras>=3.2.0->tensorflow) (0.1.2)\n","output_type":"stream"}],"execution_count":32},{"cell_type":"code","source":"pip install --upgrade transformers\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-24T10:56:49.773079Z","iopub.execute_input":"2025-08-24T10:56:49.773411Z","iopub.status.idle":"2025-08-24T10:56:53.359830Z","shell.execute_reply.started":"2025-08-24T10:56:49.773381Z","shell.execute_reply":"2025-08-24T10:56:53.358721Z"}},"outputs":[{"name":"stdout","text":"Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.55.4)\nRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.17.0)\nRequirement already satisfied: huggingface-hub<1.0,>=0.34.0 in /usr/local/lib/python3.10/dist-packages (from transformers) 
(0.34.4)\nRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.26.4)\nRequirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (24.2)\nRequirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.2)\nRequirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2024.11.6)\nRequirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.32.3)\nRequirement already satisfied: tokenizers<0.22,>=0.21 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.21.0)\nRequirement already satisfied: safetensors>=0.4.3 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.5)\nRequirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.67.1)\nRequirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.34.0->transformers) (2024.12.0)\nRequirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.34.0->transformers) (4.12.2)\nRequirement already satisfied: hf-xet<2.0.0,>=1.1.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.34.0->transformers) (1.1.8)\nRequirement already satisfied: mkl_fft in /usr/local/lib/python3.10/dist-packages (from numpy>=1.17->transformers) (1.3.8)\nRequirement already satisfied: mkl_random in /usr/local/lib/python3.10/dist-packages (from numpy>=1.17->transformers) (1.2.4)\nRequirement already satisfied: mkl_umath in /usr/local/lib/python3.10/dist-packages (from numpy>=1.17->transformers) (0.1.1)\nRequirement already satisfied: mkl in /usr/local/lib/python3.10/dist-packages (from numpy>=1.17->transformers) (2025.0.1)\nRequirement already satisfied: tbb4py in 
/usr/local/lib/python3.10/dist-packages (from numpy>=1.17->transformers) (2022.0.0)\nRequirement already satisfied: mkl-service in /usr/local/lib/python3.10/dist-packages (from numpy>=1.17->transformers) (2.4.1)\nRequirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4.1)\nRequirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.10)\nRequirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.3.0)\nRequirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2025.1.31)\nRequirement already satisfied: intel-openmp>=2024 in /usr/local/lib/python3.10/dist-packages (from mkl->numpy>=1.17->transformers) (2024.2.0)\nRequirement already satisfied: tbb==2022.* in /usr/local/lib/python3.10/dist-packages (from mkl->numpy>=1.17->transformers) (2022.0.0)\nRequirement already satisfied: tcmlib==1.* in /usr/local/lib/python3.10/dist-packages (from tbb==2022.*->mkl->numpy>=1.17->transformers) (1.2.0)\nRequirement already satisfied: intel-cmplr-lib-rt in /usr/local/lib/python3.10/dist-packages (from mkl_umath->numpy>=1.17->transformers) (2024.2.0)\nRequirement already satisfied: intel-cmplr-lib-ur==2024.2.0 in /usr/local/lib/python3.10/dist-packages (from intel-openmp>=2024->mkl->numpy>=1.17->transformers) (2024.2.0)\nNote: you may need to restart the kernel to use updated packages.\n","output_type":"stream"}],"execution_count":33},{"cell_type":"code","source":"# Instead of importing everything at once\nfrom transformers.training_args import TrainingArguments\nfrom transformers.trainer import Trainer\nfrom transformers import EarlyStoppingCallback\nfrom transformers import EarlyStoppingCallback, 
from functools import lru_cache


@lru_cache(maxsize=None)
def _load_metric(name):
    """Return the `evaluate` metric called `name`, loading it at most once.

    The previous version called `load(...)` for all four metrics on every
    evaluation pass, which re-resolved (and re-downloaded) the metric scripts
    each epoch.  Caching keeps one instance per metric name for the lifetime
    of the kernel.
    """
    # Imported here so the loader is only needed when a metric is first used.
    from evaluate import load

    return load(name)


def custom_metrics(eval_pred):
    """Compute weighted precision/recall/F1 and accuracy for a `Trainer` evaluation.

    Parameters
    ----------
    eval_pred : tuple
        A `(logits, labels)` pair.  `logits` is an array whose last axis
        indexes the classes; a tuple of outputs is also accepted, in which
        case its first element is used as the logits.

    Returns
    -------
    dict
        `{"precision", "recall", "f1", "accuracy"}` mapped to float scores.
        Precision, recall and F1 use `average="weighted"`.
    """
    logits, labels = eval_pred
    # Predictions may arrive as a tuple when the model emits extra outputs;
    # the class scores are expected to be the first element.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)

    scores = {}
    for metric_name in ("precision", "recall", "f1"):
        result = _load_metric(metric_name).compute(
            predictions=predictions, references=labels, average="weighted"
        )
        scores[metric_name] = result[metric_name]

    # Accuracy has no averaging mode, so it is computed separately.
    scores["accuracy"] = _load_metric("accuracy").compute(
        predictions=predictions, references=labels
    )["accuracy"]
    return scores
# Training configuration.  Evaluation, checkpointing and logging all run once
# per epoch; `load_best_model_at_end` needs the eval and save strategies to
# agree so that every evaluated state has a matching checkpoint.
training_args = TrainingArguments(
    output_dir="./results",
    run_name="sexually explicit comments",
    eval_strategy='epoch',
    save_strategy='epoch',
    # With an epoch-based logging strategy a `logging_steps` value is not
    # used, so the former `logging_steps=20` has been dropped.
    logging_strategy='epoch',
    logging_dir='./logs',            # directory for storing logs
    report_to="none",
    num_train_epochs=4,
    learning_rate=1e-5,
    per_device_train_batch_size=4,   # batch size per device during training
    per_device_eval_batch_size=4,    # batch size for evaluation
    warmup_steps=1000,               # number of warmup steps for learning rate
    weight_decay=0.01,               # strength of weight decay
    load_best_model_at_end=True,
    # State the selection criterion explicitly: lowest validation loss wins.
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# A patience of 10 evaluations could never trigger within 4 epochs (one
# evaluation per epoch), so early stopping was effectively disabled.  Two
# epochs without improvement in the best-model metric now ends training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=custom_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

train_result = trainer.train()
\n \n \n [13848/13848 2:20:23, Epoch 4/4]\n
\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
EpochTraining LossValidation LossPrecisionRecallF1Accuracy
10.3703000.1681550.9478500.9470650.9470580.947065
20.2168000.0958880.9798730.9798340.9798350.979834
30.1292000.0930630.9852580.9852360.9852360.985236
40.0805000.0877530.9867350.9866760.9866770.986676

"},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"Downloading builder script: 0.00B [00:00, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"f97aacb645704040a27c01086800cd7e"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"Downloading builder script: 0.00B [00:00, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"cfee8ff9408444189d4a5dae61068382"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"Downloading builder script: 0.00B [00:00, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"444fbdc793a840a592cfc9f7f0e3c07b"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"Downloading builder script: 0.00B [00:00, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"83564cfa99e2493ba097d12850b14967"}},"metadata":{}},{"name":"stderr","text":"/usr/local/lib/python3.10/dist-packages/torch/nn/parallel/_functions.py:71: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n warnings.warn(\n/usr/local/lib/python3.10/dist-packages/torch/nn/parallel/_functions.py:71: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n warnings.warn(\n/usr/local/lib/python3.10/dist-packages/torch/nn/parallel/_functions.py:71: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n warnings.warn(\n","output_type":"stream"}],"execution_count":38},{"cell_type":"code","source":"import matplotlib.pyplot as plt\n\n# Get the log history from the trainer\ntrain_loss = trainer.state.log_history\n\n# Initialize lists for epochs, training losses, and validation losses\nepochs = []\ntrain_losses = 
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix


def extract_loss_curves(log_history):
    """Split a Trainer log history into training and validation loss curves.

    Parameters
    ----------
    log_history : list of dict
        Log records; records with a 'loss' key are treated as training logs
        and records with an 'eval_loss' key as evaluation logs.  Each such
        record is expected to carry an 'epoch' key.

    Returns
    -------
    tuple of four lists
        `(train_epochs, train_losses, eval_epochs, eval_losses)`.
    """
    train_epochs, train_values = [], []
    eval_epochs, eval_values = [], []
    for record in log_history:
        if 'loss' in record:
            train_epochs.append(record['epoch'])
            train_values.append(record['loss'])
        if 'eval_loss' in record:
            # Keep the epoch of the evaluation record itself.  Re-using the
            # training epochs (as before) misplaces the validation curve
            # whenever training and evaluation are logged at different rates.
            eval_epochs.append(record['epoch'])
            eval_values.append(record['eval_loss'])
    return train_epochs, train_values, eval_epochs, eval_values


# Get the log history from the trainer
train_loss = trainer.state.log_history
epochs, train_losses, val_epochs, val_losses = extract_loss_curves(train_loss)

# Plot only when at least one training loss was logged
if train_losses:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(epochs, train_losses, label='Training Loss', color='blue')

    # Validation loss is drawn against its own epoch axis
    if val_losses:
        ax.plot(val_epochs, val_losses, label='Validation Loss', color='orange')

    ax.set_title('Training and Validation Loss Over Epochs')
    ax.set_xlabel('Epochs')
    ax.set_ylabel('Loss')
    ax.grid()
    ax.legend()
    plt.show()
else:
    print("No training loss data available.")

# Score the held-out test split with the trained model
predictions = trainer.predict(small_test_dataset)
# Predicted class = index of the largest score in each row
predicted_labels = np.argmax(predictions.predictions, axis=1)
true_labels = small_test_dataset['label']

# Rows are true classes, columns are predicted classes
cm = confusion_matrix(true_labels, predicted_labels)
print(cm)

# Per-class precision / recall / F1 on the same predictions
print(classification_report(true_labels, predicted_labels))
# Recover readable text for every test example by decoding its token ids
# (special tokens such as padding are dropped).
decoded_texts = []
for ids in small_test_dataset['input_ids']:
    decoded_texts.append(tokenizer.decode(ids, skip_special_tokens=True))

# Positions where the predicted label disagrees with the true label
true_label_array = np.array(true_labels)
predicted_label_array = np.array(predicted_labels)
misclassified_indices = np.flatnonzero(predicted_label_array != true_label_array)

# One row per misclassified example: text, true label and predicted label
misclassified_samples = pd.DataFrame({
    'text': np.array(decoded_texts)[misclassified_indices],
    'true_label': true_label_array[misclassified_indices],
    'predicted_label': predicted_label_array[misclassified_indices],
})

# Display the misclassified samples
print(misclassified_samples)
# %% What-if analysis: how many fewer errors would a small accuracy gain give?
import numpy as np
import pandas as pd

# Scenario parameters (kept as module-level names so other cells can reuse them).
initial_accuracy = 0.93    # baseline accuracy used for the comparison
test_size = 6942           # number of evaluation samples before scaling by the split ratio
split_ratios = [0.8, 0.9]  # train fractions: 80-20 split and 90-10 split

RESULT_COLUMNS = [
    'Split Ratio',
    'Initial Misclassifications',
    'Improved Misclassifications',
    'Accuracy Increase (%)',
    'Reduction in Misclassifications',
]


def calculate_misclassifications(accuracy, test_size):
    """Return the expected number of misclassified samples.

    Args:
        accuracy: Fraction of correct predictions.
        test_size: Number of evaluated samples.

    Returns:
        ``test_size * (1 - accuracy)``; the value may be fractional.
    """
    return test_size * (1 - accuracy)


def format_split_label(split_ratio):
    """Format a train fraction as 'train-test' percentages, e.g. 0.8 -> '80-20'.

    The previous code hard-coded the test part as '-10', which mislabeled the
    80-20 split as '80-10' in both the table and the plot legend.
    """
    train_pct = int(round(split_ratio * 100))  # round() guards against 0.29 * 100 == 28.999...
    return f'{train_pct}-{100 - train_pct}'


def simulate_accuracy_gains(initial_accuracy, test_size, split_ratios, accuracy_increases):
    """Tabulate the drop in misclassifications for each accuracy gain and split.

    Args:
        initial_accuracy: Baseline accuracy as a fraction.
        test_size: Sample count that is scaled by each split ratio.
        split_ratios: Iterable of fractions used to scale ``test_size``.
        accuracy_increases: Iterable of absolute accuracy gains (0.005 == +0.5%).

    Returns:
        A DataFrame with one row per (accuracy increase, split ratio) pair and
        the columns listed in ``RESULT_COLUMNS``. Empty inputs give an empty
        frame that still carries those columns.
    """
    rows = []
    for accuracy_increase in accuracy_increases:
        improved_accuracy = initial_accuracy + accuracy_increase
        for split_ratio in split_ratios:
            # NOTE(review): this scales the sample count by the *train* fraction,
            # exactly as the original cell did - confirm that is the intended size.
            split_size = int(test_size * split_ratio)
            before = calculate_misclassifications(initial_accuracy, split_size)
            after = calculate_misclassifications(improved_accuracy, split_size)
            rows.append({
                'Split Ratio': format_split_label(split_ratio),
                'Initial Misclassifications': before,
                'Improved Misclassifications': after,
                'Accuracy Increase (%)': accuracy_increase * 100,
                'Reduction in Misclassifications': before - after,
            })
    return pd.DataFrame(rows, columns=RESULT_COLUMNS)


def plot_reduction(results, split_ratios):
    """Plot reduction in misclassifications against accuracy increase, one line per split."""
    # Local import: the numeric helpers above stay usable where matplotlib is absent.
    import matplotlib.pyplot as plt

    fig, ax = plt.subplots(figsize=(10, 6))
    for split_ratio in split_ratios:
        label = format_split_label(split_ratio)
        subset = results[results['Split Ratio'] == label]
        ax.plot(
            subset['Accuracy Increase (%)'],
            subset['Reduction in Misclassifications'],
            marker='o',
            label=f'Split Ratio {label}',
        )
    ax.set_title('Reduction in Misclassifications vs. Accuracy Increase')
    ax.set_xlabel('Accuracy Increase (%)')
    ax.set_ylabel('Reduction in Misclassifications')
    ax.set_xticks(np.arange(0.5, 1.1, 0.1))
    ax.grid()
    ax.legend()
    plt.show()


# 0.5% to 1% accuracy increase in three steps.
results = simulate_accuracy_gains(
    initial_accuracy, test_size, split_ratios, np.linspace(0.005, 0.01, num=3)
)
print(results)

# Always true inside the notebook kernel; skipped only if this cell is imported
# as a module (e.g. after `nbconvert --to script`), where no figure is wanted.
if __name__ == "__main__":
    plot_reduction(results, split_ratios)
# %% Rank vocabulary terms by their summed TF-IDF weight over the training split
# Recover plain text from the tokenized training examples so it can be re-vectorized.
text_data = []
for ids in small_train_dataset['input_ids']:
    text_data.append(tokenizer.decode(ids, skip_special_tokens=True))

# Fit a TF-IDF model on the decoded strings.
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(text_data)
feature_names = vectorizer.get_feature_names_out()

# Column-wise total of the TF-IDF matrix, flattened to one score per term.
sum_tfidf = np.asarray(X_train_vectorized.sum(axis=0)).flatten()

# Terms with the largest summed TF-IDF come first.
features_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': sum_tfidf}
).sort_values(by='Importance', ascending=False)

top_n = 100  # number of leading terms to show
print(f"\nTop {top_n} features:")
print(features_df.head(top_n))
# %% Peek at the first few training examples
# Slicing the dataset (`small_train_dataset[:5]`) yields a dict of columns, so
# iterating over it only printed the column names. `select` keeps row structure.
# assumes small_train_dataset is a Hugging Face `datasets.Dataset` - TODO confirm
n_preview = min(5, len(small_train_dataset))  # adjust the preview size as needed
for example in small_train_dataset.select(range(n_preview)):
    print(example)

# %% Analyzing absolute vs. borderline classifications
# `predictions.predictions` presumably holds the raw logits returned by
# `Trainer.predict` (one column per class) - TODO confirm. Logits are not
# probabilities, so convert them with a numerically stable softmax first.
logits = np.asarray(predictions.predictions, dtype=np.float64)
shifted = logits - logits.max(axis=1, keepdims=True)  # avoids overflow in exp
exp_scores = np.exp(shifted)
class_probabilities = exp_scores / exp_scores.sum(axis=1, keepdims=True)

# Use P(class 1): the max over two class probabilities can never fall below 0.5,
# which would make the 0.4-0.6 window and the 0.5 threshold line meaningless.
probabilities = class_probabilities[:, 1]
borderline_indices = np.where((probabilities > 0.4) & (probabilities < 0.6))[0]

# Rows and predicted labels of the samples the model is least certain about.
borderline_samples = small_test_dataset[borderline_indices]
borderline_labels = np.asarray(predicted_labels)[borderline_indices]

# Visualizing borderline classifications
plt.figure(figsize=(10, 6))
plt.hist(probabilities[borderline_indices], bins=10, color='orange', alpha=0.7)
plt.axvline(x=0.5, color='red', linestyle='--', label='Threshold')
plt.title('Probability Distribution of Borderline Classifications')
plt.xlabel('Probability')
plt.ylabel('Frequency')
plt.legend()
plt.show()
# %% LIME explanations for a handful of hand-written sentences
from lime.lime_text import LimeTextExplainer

# Move model to GPU if available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


def predict(texts, batch_size=32):
    """Return class probabilities for a collection of raw texts.

    LIME calls this with thousands of perturbed copies of one sentence, so the
    texts are tokenized and scored in mini-batches rather than as one huge
    tensor, which could exhaust GPU memory.

    Args:
        texts: Iterable of strings to classify.
        batch_size: Number of texts scored per forward pass.

    Returns:
        A NumPy array with one row per input text and one column per class,
        each row being the softmax of the model logits. For empty input the
        array has zero rows.
    """
    texts = [str(text) for text in texts]
    model.eval()  # disable dropout so repeated calls give the same scores

    chunks = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            # Tokenize the batch and move it to the model's device.
            inputs = tokenizer(batch, padding=True, truncation=True, return_tensors='pt').to(device)
            logits = model(**inputs).logits
            # Softmax over the class dimension, then back to CPU for LIME.
            chunks.append(torch.nn.functional.softmax(logits, dim=1).cpu().numpy())

    if not chunks:
        return np.empty((0, model.config.num_labels))
    return np.concatenate(chunks, axis=0)


# Initialize the explainer.
explainer = LimeTextExplainer(class_names=['0', '1'])

# Hand-written example sentences (not drawn from the training split, despite the name).
small_train_dataset1 = [
    "ባየሁህ ቁጥር አብረን ልንሰራው የምንችለውን ሁሉንም ባለጌ ነገር መገመት አያቅተኝም",
    "በጣም የማይቋቋሙት ነሽ ሁለታችንም ከእንግዲህ ልንወስደው እስካልቻልን ድረስ እያንዳንዳችሁን ኢንች ማሰስ እፈልጋለሁ",
    "ሁልጊዜ ባየሁህ ጊዜ የእኔ ቀን ወዲያውኑ ይሻሻላል; በእኔ ላይ ያን አስደናቂ ተፅእኖ አለህ!",
    "በጣም የሚያምር ፈገግታ አለህ ክፍሉን በሙሉ ያበራል",
]

# Generate one explanation per sentence; iterating over the list keeps the loop
# in sync with the examples instead of relying on a hard-coded count.
for text in small_train_dataset1:
    exp = explainer.explain_instance(text, predict, num_features=10)
    exp.show_in_notebook(text=True)
\n \n \n "},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"","text/html":"\n \n \n
\n \n \n "},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"","text/html":"\n \n \n
\n \n \n "},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"","text/html":"\n \n \n
# %% LIME explanations and raw attention values for labeled example sentences
import lime
import lime.lime_text

# Named `lime_sample_texts` rather than `data`, so the notebook's main `data`
# DataFrame is not overwritten by this list.
lime_sample_texts = [
    "በእኔ ላይ ምን ያህል ጥሩ ስሜት እንደሚሰማዎት ማሰብ ማቆም አልችልም ሌሊቱን ሙሉ እንዲሰማኝ እፈልጋለሁ",
    "ያንቺ ልብስ ለብሰሽ ያለሽ ሀሳብ ብቻ ዱርዬ ገፋፋኝ እንደገና ለማየት መጠበቅ አልችልም",
    "ስሜን በሹክሹክታ የምትናገሩበትን መንገድ ወድጄዋለሁ የሚያቃስቱዎትን ነገሮች ሁሉ ለማድረግ እንድፈልግ ያደርገኛል",
    "ባየሁህ ቁጥር አብረን ልንሰራው የምንችለውን ሁሉንም ባለጌ ነገር መገመት አያቅተኝም",
    "በጣም የማይቋቋሙት ነሽ ሁለታችንም ከእንግዲህ ልንወስደው እስካልቻልን ድረስ እያንዳንዳችሁን ኢንች ማሰስ እፈልጋለሁ",
    "ሁልጊዜ ባየሁህ ጊዜ የእኔ ቀን ወዲያውኑ ይሻሻላል; በእኔ ላይ ያን አስደናቂ ተፅእኖ አለህ!",
    "በጣም የሚያምር ፈገግታ አለህ ክፍሉን በሙሉ ያበራል",
    "የምትስቁበትን መንገድ ወድጄዋለሁ ተላላፊ ነው እና አብሬ መሳቅ እንድፈልግ ያደርገኛል",
    "እንደ ንጹህ አየር እስትንፋስ ነዎት በዙሪያዎ መሆን በጣም የሚያድስ ስሜት ይሰማዎታል",
    "ስለ አንተ ባሰብኩ ጊዜ ፈገግ ከማለት በቀር ምንም ማድረግ አልችልም፣ ልቤን እንዴት እንደምታስደስት ታውቃለህ!",
]
labels = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]

# One row per example sentence with its hand-assigned label.
df = pd.DataFrame({"text": lime_sample_texts, "label": labels})

# Use the GPU when present instead of hard-coding 'cuda', so the cell also runs on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trainer.model = trainer.model.to(device)
# NOTE(review): setting this config attribute after the model is loaded may not
# switch the attention implementation - confirm that `output_attentions=True`
# actually returns attention tensors for this model.
trainer.model.config.attn_implementation = "eager"


def predictor(texts, batch_size=32):
    """Return class probabilities for the texts LIME asks about.

    Args:
        texts: Iterable of strings (LIME passes many perturbed variants).
        batch_size: Number of texts scored per forward pass; batching avoids
            building one very large tensor on the device.

    Returns:
        A NumPy array with one row per text holding the softmax of the logits.
        For empty input the array has zero rows.
    """
    texts = [str(text) for text in texts]
    trainer.model.eval()

    chunks = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to(device)
            logits = trainer.model(**inputs).logits
            chunks.append(torch.nn.functional.softmax(logits, dim=-1).cpu().numpy())

    if not chunks:
        return np.empty((0, trainer.model.config.num_labels))
    return np.concatenate(chunks, axis=0)


def get_attention(text):
    """Run the model on one text and return its attention outputs.

    Args:
        text: A single string.

    Returns:
        The ``attentions`` field of the model output, which the caller below
        indexes as ``[layer][batch][head][token]``.
    """
    # Truncate so over-long texts cannot exceed the model's input limit.
    inputs = tokenizer(text, return_tensors="pt", truncation=True).to(device)
    trainer.model.eval()
    with torch.no_grad():  # inference only; no autograd graph needed
        outputs = trainer.model(**inputs, output_attentions=True)
    return outputs.attentions


explainer_lime = lime.lime_text.LimeTextExplainer(class_names=['0', '1'])

# Loop through all examples and print the LIME weights and a slice of attention.
for index, (sample_text, sample_label) in enumerate(zip(df["text"], df["label"])):
    explanation = explainer_lime.explain_instance(
        sample_text,
        classifier_fn=predictor,
    )
    print(f"Explanation for sample {index}:")
    print(dict(explanation.as_list()))
    print("-" * 40)

    attention_values = get_attention(sample_text)
    print("Attention values for first layer, first head, first token:")
    print(attention_values[0][0][0][0])
    print("=" * 40)

# %% Confusion matrix
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# `cm` is the confusion matrix computed in an earlier cell.
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()

# %% Print each evaluation metric with its name
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print("accuracy: ", accuracy_score(true_labels, predicted_labels))
print("precision:", precision_score(true_labels, predicted_labels))
print("recall:   ", recall_score(true_labels, predicted_labels))
print("f1:       ", f1_score(true_labels, predicted_labels))