# %% Environment
# The original first cell ran `!pip install normalize_text`. Its captured output
# reports "No matching distribution found for normalize_text", and nothing in the
# notebook imports that package, so the install step is dropped.

# %% Imports
import os
import re
import string
import unicodedata

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import pyarabic.araby as araby
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer
from nltk.tokenize import word_tokenize
from sklearn import metrics, svm
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.naive_bayes import ComplementNB, MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.class_weight import compute_class_weight
from tashaphyne.stemming import ArabicLightStemmer

# %% Configuration
# Location of the labelled tweets. The default is the path the notebook was
# authored with, so existing runs keep working; set the TWEETS_CSV environment
# variable to point at the file on any other machine.
DATA_PATH = os.environ.get(
    "TWEETS_CSV",
    r"C:\Users\moner\Downloads\cleaned_thefinaldataset monerah.csv",
)

# %% NLTK resources used by the tokenizer and the stop-word filter
nltk.download('stopwords')
nltk.download('punkt')

# %% Load the data
# read csv file and save it in a data frame
data_df = pd.read_csv(DATA_PATH)
# Drop any rows with missing values in the LABEL.
# Re-assignment instead of inplace=True keeps the cell safe to re-run.
data_df = data_df.dropna(subset=['LABEL'])

# %% Class balance
data_df['LABEL'].value_counts()

# %% Drop duplicate tweets
# Keeping the first row per tweet also removes fully identical rows, so the
# separate whole-row drop_duplicates pass that used to precede this is not needed.
data_df = data_df.drop_duplicates(subset='tweet', keep='first')

# %% Peek at the raw tweets
data_df['tweet']
# Emoji -> Arabic gloss lookup used by convert_emojis().
#
# Changes relative to the previous table:
#   * keys no longer carry a trailing space ("🎼", "💂🏿", "☄️"); with the space they
#     only matched when the emoji happened to be followed by a blank;
#   * keys that were written twice (🙂, 👊, ❗️, ✍🏻, ☕️ - same gloss both times)
#     appear once;
#   * the gloss for 💡 is spelled "فكرة" (was "قكرة").
# Entry order is otherwise unchanged.
#
# NOTE(review): "🏽" is a bare skin-tone modifier glossed as "قوه"; any emoji that
# carries that modifier but has no dedicated key will get this gloss appended.
# Confirm this is intended.
my_emojis = {
    "🙂": "يبتسم",
    "😂": "يضحك",
    "💔": "قلب حزين",
    "🤭": "حياء",
    "❤️": "حب",
    "❤": "حب",
    "😍": "حب",
    "😭": "يضحك",
    "😢": "حزن",
    "😔": "حزن",
    "♥": "حب",
    "💜": "حب",
    "😅": "يضحك",
    "🙁": "حزين",
    "💕": "حب",
    "💙": "حب",
    "😞": "حزين",
    "😊": "سعادة",
    "👏": "يصفق",
    "👌": "احسنت",
    "😴": "ينام",
    "😀": "يضحك",
    "😌": "مرتاح",
    "🌹": "وردة",
    "🙈": "حب",
    "😄": "يضحك",
    "😐": "محايد",
    "✌": "منتصر",
    "✨": "نجمه",
    "🤔": "تفكير",
    "😏": "يستهزء",
    "😒": "يستهزء",
    "🙄": "ملل",
    "😕": "عصبية",
    "😃": "يضحك",
    "🌸": "وردة",
    "😓": "حزن",
    "💞": "حب",
    "💗": "حب",
    "😑": "منزعج",
    "💭": "تفكير",
    "😎": "ثقة",
    "💛": "حب",
    "😩": "حزين",
    "💪": "عضلات",
    "👍": "موافق",
    "🙏🏻": "رجاء طلب",
    "😳": "مصدوم",
    "👏🏼": "تصفيق",
    "🎶": "موسيقي",
    "🌚": "صمت",
    "💚": "حب",
    "🙏": "رجاء طلب",
    "💘": "حب",
    "🍃": "سلام",
    "☺": "يضحك",
    "🐸": "ضفدع",
    "😶": "مصدوم",
    "✌️": "مرح",
    "✋🏻": "توقف",
    "😉": "غمزة",
    "🌷": "حب",
    "🙃": "مبتسم",
    "😫": "حزين",
    "😨": "مصدوم",
    "🎼": "موسيقي",
    "🍁": "مرح",
    "🍂": "مرح",
    "💟": "حب",
    "😪": "حزن",
    "😆": "يضحك",
    "😣": "استياء",
    "☺️": "حب",
    "😱": "كارثة",
    "😁": "يضحك",
    "😖": "استياء",
    "🏃🏼": "يجري",
    "😡": "غضب",
    "🚶": "يسير",
    "🤕": "مرض",
    "‼️": "تعجب",
    "🕊": "طائر",
    "👌🏻": "احسنت",
    "❣": "حب",
    "🙊": "مصدوم",
    "💃": "سعادة مرح",
    "💃🏼": "سعادة مرح",
    "😜": "مرح",
    "👊": "ضربة",
    "😟": "استياء",
    "💖": "حب",
    "😥": "حزن",
    "🎻": "موسيقي",
    "✒": "يكتب",
    "🚶🏻": "يسير",
    "💎": "الماس",
    "😷": "وباء مرض",
    "☝": "واحد",
    "🚬": "تدخين",
    "💐": "ورد",
    "🌞": "شمس",
    "👆": "الاول",
    "⚠️": "تحذير",
    "🤗": "احتواء",
    "✖️": "غلط",
    "📍": "مكان",
    "👸": "ملكه",
    "👑": "تاج",
    "✔️": "صح",
    "💌": "قلب",
    "😲": "مندهش",
    "💦": "ماء",
    "🚫": "خطا",
    "👏🏻": "برافو",
    "🏊": "يسبح",
    "👍🏻": "تمام",
    "⭕️": "دائره كبيره",
    "🎷": "ساكسفون",
    "👋": "تلويح باليد",
    "✌🏼": "علامه النصر",
    "🌝": "مبتسم",
    "➿": "عقده مزدوجه",
    "💪🏼": "قوي",
    "📩": "تواصل معي",
    "☕️": "قهوه",
    "😧": "قلق و صدمة",
    "🗨": "رسالة",
    "❗️": "تعجب",
    "🙆🏻": "اشاره موافقه",
    "👯": "اخوات",
    "©": "رمز",
    "👵🏽": "سيده عجوزه",
    "🐣": "كتكوت",
    "🙌": "تشجيع",
    "🙇": "شخص ينحني",
    "👐🏽": "ايدي مفتوحه",
    "👌🏽": "بالظبط",
    "⁉️": "استنكار",
    "⚽️": "كوره",
    "🕶": "حب",
    "🎈": "بالون",
    "🎀": "ورده",
    "💵": "فلوس",
    "😋": "جائع",
    "😛": "يغيظ",
    "😠": "غاضب",
    "✍🏻": "يكتب",
    "🌾": "ارز",
    "👣": "اثر قدمين",
    "❌": "رفض",
    "🍟": "طعام",
    "👬": "صداقة",
    "🐰": "ارنب",
    "☂": "مطر",
    "⚜": "مملكة فرنسا",
    "🐑": "خروف",
    "🗣": "صوت مرتفع",
    "👌🏼": "احسنت",
    "☘": "مرح",
    "😮": "صدمة",
    "😦": "قلق",
    "⭕": "الحق",
    "✏️": "قلم",
    "ℹ": "معلومات",
    "🙍🏻": "رفض",
    "⚪️": "نضارة نقاء",
    "🐤": "حزن",
    "💫": "مرح",
    "💝": "حب",
    "🍔": "طعام",
    "❤︎": "حب",
    "✈️": "سفر",
    "🏃🏻♀️": "يسير",
    "🍳": "ذكر",
    "🎤": "مايك غناء",
    "🎾": "كره",
    "🐔": "دجاجة",
    "🙋": "سؤال",
    "💉": "دواء",
    "🙏🏼": "رجاء طلب",
    "💂🏿": "حارس",
    "🎬": "سينما",
    "♦️": "مرح",
    "💡": "فكرة",
    "‼": "تعجب",
    "👼": "طفل",
    "🔑": "مفتاح",
    "♥️": "حب",
    "🕋": "كعبة",
    "🐓": "دجاجة",
    "💩": "معترض",
    "👽": "فضائي",
    "☔️": "مطر",
    "🍷": "عصير",
    "🌟": "نجمة",
    "☁️": "سحب",
    "👃": "معترض",
    "🌺": "مرح",
    "🔪": "سكينة",
    "♨": "سخونية",
    "👊🏼": "ضرب",
    "✏": "قلم",
    "🚶🏾♀️": "يسير",
    "◾️": "وقف",
    "😚": "حب",
    "🔸": "مرح",
    "👎🏻": "لا يعجبني",
    "👊🏽": "ضربة",
    "😙": "حب",
    "🎥": "تصوير",
    "👉": "جذب انتباه",
    "👏🏽": "يصفق",
    "💪🏻": "عضلات",
    "🏴": "اسود",
    "🔥": "حريق",
    "😬": "عدم الراحة",
    "👊🏿": "يضرب",
    "🌿": "ورقه شجره",
    "✋🏼": "كف ايد",
    "👐": "ايدي مفتوحه",
    "☠️": "وجه مرعب",
    "🎉": "يهنئ",
    "🔕": "صامت",
    "😿": "وجه حزين",
    "☹️": "وجه يائس",
    "😘": "حب",
    "😰": "خوف و حزن",
    "🌼": "ورده",
    "💋": "بوسه",
    "👇": "لاسفل",
    "❣️": "حب",
    "🎧": "سماعات",
    "📝": "يكتب",
    "😇": "ملاك",
    "😈": "رعب",
    "🏃": "يجري",
    "✌🏻": "علامه النصر",
    "🔫": "يضرب",
    "👎": "غير موافق",
    "🔐": "قفل",
    "👈": "لليمين",
    "™": "رمز",
    "🚶🏽": "يتمشي",
    "😯": "متفاجأ",
    "✊": "يد مغلقه",
    "😻": "اعجاب",
    "🙉": "قرد",
    "👧": "طفله صغيره",
    "🔴": "دائره حمراء",
    "🏽": "قوه",
    "💤": "ينام",
    "👀": "ينظر",
    "❄️": "تلج",
    "💀": "رعب",
    "😤": "وجه عابس",
    "🖋": "قلم",
    "🎩": "كاب",
    "😹": "ضحك",
    "💓": "حب",
    "☄️": "نار",
    "👻": "رعب",
    "🤣": "ضحك",
}
# Remove special characters
arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
english_punctuations = string.punctuation
punctuations_list = arabic_punctuations + english_punctuations

# Translation table applied by remove_punctuations(): every listed mark becomes a
# space so that the words on either side stay separate. The tatweel (U+0640) is
# deleted instead, because it is written inside a word to stretch it.
_PUNCTUATION_TABLE = str.maketrans({
    **{mark: ' ' for mark in punctuations_list},
    '\u0640': None,
})

# Patterns are compiled once here instead of on every call.
_NON_ARABIC_RE = re.compile(r'[^\u0600-\u06FF\s]')
_MARKS_RE = re.compile(
    r'[\u0610-\u061A\u064B-\u0652\u06D6-\u06DC\u06DF\u06E0\u06E4-\u06E7\u06E9\u06EA\u06ED]'
)
_REPEAT_RE = re.compile(r'(.)\1+')
_LONG_REPEAT_RE = re.compile(r'(\w)\1{2,}')

# Filled on first use by remove_stop_words().
_ARABIC_STOP_WORDS = None

# Cache of (compiled regex, gloss lookup) per emoji mapping, see _emoji_matcher().
_EMOJI_MATCHERS = {}


def remove_punctuations(text):
    """Replace every character of ``punctuations_list`` in ``text`` with a space.

    The previous version built a translation table that was never applied and
    then ran a regex whose character class was closed early by the unescaped
    ``]`` in ``[\\\\]``. The rest of that pattern parsed as an alternation that
    practically never matched, so only underscores were replaced and Arabic
    marks such as the question mark stayed in the text.

    Parameters:
        text: the tweet; non-string values (e.g. NaN) are returned unchanged.

    Returns:
        The text with punctuation replaced by spaces, tatweel removed, and
        leading/trailing whitespace stripped.
    """
    if not isinstance(text, str):
        return text  # Return the input value if it's not a string
    return text.translate(_PUNCTUATION_TABLE).strip()


# Tokenization
def tokenize_tweet(tweet):
    """Split ``tweet`` into word tokens with ``nltk.word_tokenize``.

    Returns an empty list when ``tweet`` is not a string.
    """
    if not isinstance(tweet, str):
        return []
    return nltk.word_tokenize(tweet)


def remove_stop_words(text):
    """Return ``text`` without Arabic stop words and without non-alphabetic tokens.

    The NLTK Arabic stop-word list is loaded on the first call and reused
    afterwards instead of being rebuilt for every text.
    """
    global _ARABIC_STOP_WORDS
    if _ARABIC_STOP_WORDS is None:
        _ARABIC_STOP_WORDS = frozenset(stopwords.words('arabic'))
    kept = [
        word
        for word in word_tokenize(text)
        if word not in _ARABIC_STOP_WORDS and word.isalpha()
    ]
    return " ".join(kept)


def remove_non_arabic(text):
    """Delete every character that is neither whitespace nor in U+0600-U+06FF."""
    return _NON_ARABIC_RE.sub('', text)


# stemming
def stem_tokens(tokens):
    """Light-stem each token with ``ArabicLightStemmer`` and join them with spaces."""
    stemmer = ArabicLightStemmer()
    return ' '.join(stemmer.light_stem(token) for token in tokens)


# Normalization
def normalize_arabic(text):
    """Normalize an Arabic string: strip marks and digits, shorten letter runs.

    Steps, in order:
      1. trim and apply Unicode NFKD decomposition;
      2. remove the combining marks listed in ``_MARKS_RE``;
      3. shorten any run of a repeated character to two occurrences
         (elongation such as "ههههه" becomes "هه");
      4. ``araby.strip_tashkeel`` then ``araby.strip_diacritics``;
      5. drop every character for which ``str.isdigit()`` is true;
      6. shorten runs of three or more identical word characters to one. After
         step 3 such a run can only exist where step 5 removed digits that
         were separating two runs;
      7. trim again.

    In the notebook's rendered output the hamza-carrying letters come out as
    their bare base letters (e.g. "أ" as "ا"); NFKD splits them into base
    letter + combining mark, and the mark is presumably removed in step 4.
    """
    text = unicodedata.normalize('NFKD', text.strip())
    text = _MARKS_RE.sub('', text)

    # Remove longation
    text = _REPEAT_RE.sub(r'\1\1', text)

    text = araby.strip_tashkeel(text)
    text = araby.strip_diacritics(text)
    text = ''.join(char for char in text if not char.isdigit())

    # remove repeated letters
    text = _LONG_REPEAT_RE.sub(r'\1', text)

    return text.strip()  # Remove leading and trailing spaces


def _emoji_matcher(emoji_map):
    """Return ``(regex, lookup)`` for ``emoji_map``, building them once per mapping.

    ``lookup`` maps each whitespace-stripped emoji key to its gloss with ``,``
    and ``:`` removed and inner whitespace collapsed. ``regex`` is one
    alternation of all keys ordered longest first, or ``None`` when the mapping
    has no usable key.
    """
    cache_key = tuple(emoji_map.items())
    matcher = _EMOJI_MATCHERS.get(cache_key)
    if matcher is None:
        lookup = {}
        for emoji, meaning in emoji_map.items():
            emoji = emoji.strip()  # a key like "🎼 " must match the bare emoji
            if not emoji:
                continue
            gloss = " ".join(meaning.replace(",", "").replace(":", "").split())
            # An exact key listed earlier wins over a later key that only
            # becomes equal to it after stripping.
            lookup.setdefault(emoji, gloss)
        # Longest first: "👏🏽" has to be tried before "👏", otherwise the base
        # emoji is consumed and the orphaned modifier gets a gloss of its own.
        ordered = sorted(lookup, key=len, reverse=True)
        regex = None
        if ordered:
            # NOTE(review): the (?!\w) look-ahead is kept from the previous
            # pattern; it leaves an emoji untouched when a word character
            # follows it directly (it is then deleted by remove_non_arabic).
            # Confirm that this is wanted.
            regex = re.compile(
                '(?:' + '|'.join(re.escape(emoji) for emoji in ordered) + r')(?!\w)'
            )
        matcher = (regex, lookup)
        _EMOJI_MATCHERS[cache_key] = matcher
    return matcher


def convert_emojis(tweet, emoji_map=None):
    """Replace each known emoji in ``tweet`` with its gloss, padded by spaces.

    All emojis are substituted in a single pass that prefers the longest
    matching key. The previous implementation ran one substitution per key in
    dictionary order, so a short key such as "👏" consumed the first part of
    "👏🏽" before the longer key was ever tried.

    Parameters:
        tweet: the text to convert.
        emoji_map: mapping of emoji to gloss; defaults to the notebook-level
            ``my_emojis`` table defined in an earlier cell.

    Returns:
        The converted text. A non-string ``tweet`` is returned unchanged,
        except ``None`` which becomes ``""``.
    """
    if not isinstance(tweet, str):
        return tweet if tweet is not None else ""
    if emoji_map is None:
        emoji_map = my_emojis
    regex, lookup = _emoji_matcher(emoji_map)
    if regex is None:
        return tweet
    # A callable replacement keeps the gloss literal (no backslash processing).
    return regex.sub(lambda found: f" {lookup[found.group(0)]} ", tweet)


def processPost(tweet):
    """Run the full cleaning pipeline on one tweet and return the cleaned string.

    Order: emojis to words, punctuation to spaces, NLTK tokenization,
    non-Arabic characters removed, Arabic normalization. A non-string input
    ends up as an empty string because ``tokenize_tweet`` yields no tokens.
    """
    # Convert emojis
    tweet = convert_emojis(tweet)

    # Remove punctuations from the tweet
    tweet = remove_punctuations(tweet)

    # Tokenize the cleaned tweet
    tokens = tokenize_tweet(tweet)

    # Remove non-Arabic characters
    tokens = remove_non_arabic(' '.join(tokens)).split()

    # Normalize Arabic text
    tokens = normalize_arabic(' '.join(tokens)).split()

    return ' '.join(tokens)  # Join the processed words back into a single string


# In a notebook kernel __name__ is "__main__", so this runs exactly as before;
# the guard only keeps the definitions above importable without a loaded frame.
if __name__ == "__main__":
    data_df['clean_text'] = data_df['tweet'].apply(processPost)
\n", " | clean_text | \n", "tweet | \n", "
---|---|---|
0 | \n", "اولا الله يوفقهم ان شاءالله وثانيا الرياضة شي جميل وليس محرم على الجنسين سوا الرجال او الحريم ولكن الفيديو هذا ليس بالسعودية نهايي هذا فيديو خليط منوع من اللقطات من تركيا ولقطة من مدرب امريكي فقط ؟ | \n", "@TRTArabi اولاً الله يوفقهم ان شاءالله\\r\\nوثانياً الرياضة شي جميل وليس محرم على الجنسين سوا الرجال او الحريم\\r\\nولكن الفيديو هذا ليس بالسعودية نهائي\\r\\nهذا فيديو خليط منوع من اللقطات من تركيا ولقطة من مدرب امريكي فقط...!؟ | \n", "
1 | \n", "الاحظ شكثر الحريم اللي يمشون على ممشى البحر وفي الحدايق التفكير فعلا تغير وصارت الناس كلها تمشي وتحب الرياضة | \n", "@llms_ma ألاحظ شكثر الحريم اللي يمشون على ممشى البحر وفي الحدايق التفكير فعلاً تغير وصارت الناس كلها تمشي وتحب الرياضة | \n", "
2 | \n", "حلو بما اننه عجبتش الرياضة استمري فيها وما بس مشي لعبي كورة ان شاء الله كذا نشوفش لاعبة فمنتخبنا مال الحريم يضحك يضحك يضحك يضحك يضحك | \n", "@saam___99 حلو بما اننه عجبتش الرياضة استمري فيها وما بس مشي لعبي كورة ان شاء الله كذا نشوفش لاعبة فمنتخبنا مال الحريم 😂😂😂😂😂 | \n", "