""" @author: Mohamed A. Mostafa """ import pandas as pd import matplotlib.pyplot as plt import geopandas as gpd import folium import webbrowser from nltk.corpus import stopwords from collections import Counter from arabic_reshaper import arabic_reshaper from bidi.algorithm import get_display from wordcloud import WordCloud import ast colors = ["023e8a", "015ba0", "0077b6", "0087bf", "0096c7", "00b4d8", "48cae4", "90e0ef", "ade8f4", "caf0f8"] colors_dark = ["ffbe0b","fd8a09","fb5607","4c5146","3c3f48","2b2c49","8d2843","ef233c","b42626","78290f"] colors_pie = ["0D59A0", "0a9396","94d2bd","e9d8a6","ee9b00","ca6702","bb3e03"] def display_stats(df): print('-' * 60) print('DATASET STATISTICS') print('-' * 60) print('number of claims:', df['claim_id'].nunique()) print('number of tweets:', df['tweet_id'].nunique()) print('count of true tweets:', df['label'].value_counts()[0]) print('count of fake tweets:', df['label'].value_counts()[1]) print('min. number of tweets per claim:', df.groupby('claim_id')['tweet_id'].count().min()) print('max. number of tweets per claim:', df.groupby('claim_id')['tweet_id'].count().max()) print('possibly sensitive tweets:', df['possibly_sensitive'].value_counts()[True]) print('max. true tweets in one claim:', df.groupby(['claim_id', 'label']).size().unstack()[0].max()) print('max. fake tweets in one claim:', df.groupby(['claim_id', 'label']).size().unstack()[1].max()) print('count of unique users:', df['user_id'].nunique()) print('tweets of verified users:', df[df['verified'] == True].shape[0]) print('min. number of tweets per user:', df.groupby('user_id')['tweet_id'].count().min()) print('max. number of tweets per user:', df.groupby('user_id')['tweet_id'].count().max()) print('-' * 60, '\n\n') def plt_chart(x_values, y_values, title='', xlabel='', ylabel='', show_title=False, show_xticks=True, show_yticks=True, data_annotations=None, chart_type='bar', xticks_text=True, xticks_step=1, xticks_angel=0, xticks_align='center', yticks_start=0, yticks_step=10, grid='x', width=10, height=6, marker=None, fillstyle='full', colors=colors): # Set the total width of the figure fig, ax = plt.subplots(figsize=(width, height)) if chart_type == 'bar': plt.bar(x_values, y_values, color=['#' + c for c in colors]) elif chart_type == 'plot': plt.plot(x_values, y_values, color='#' + colors[0], marker=marker, fillstyle=fillstyle) if show_title: ax.set_title(title) ax.set_xlabel(xlabel, labelpad=18) ax.set_ylabel(ylabel, labelpad=18) if show_xticks: if xticks_step > 1: x_ticks = range(min(x_values), int(max(x_values)) + xticks_step, xticks_step) ax.set_xticks(x_ticks) else: ax.set_xticks(x_values) if (xticks_text): ax.set_xticklabels(x_values, rotation=xticks_angel, ha=xticks_align) else: if xticks_step > 1: ax.set_xticklabels(x_ticks, rotation=xticks_angel) else: ax.set_xticklabels(x_values, rotation=xticks_angel) ax.tick_params(axis='x', which='both', width=2, length=6) else: ax.set_xticks([]) if show_yticks: y_ticks = range(yticks_start, int(max(y_values)) + yticks_step, yticks_step) ax.set_yticks(y_ticks) ax.set_yticklabels(y_ticks) ax.tick_params(axis='y', which='both', width=2, length=6) else: ax.set_yticks([]) ax.yaxis.grid(True, linestyle='-.', linewidth=0.5, color='gray') ax.xaxis.grid(True, linestyle='-.', linewidth=0.5, color='gray') plt.grid(axis=grid) if data_annotations == 'y-values': for i in range(len(x_values)): plt.annotate(f"{y_values[i]}", (x_values[i], y_values[i]), textcoords="offset points", xytext=(0, 10), ha='center', color='#126782') elif data_annotations is not 

def fake_true_charts(df):
    # (a) Bar chart of fake vs. true tweets
    # Count the fake and true tweets (label 1 = fake, label 0 = true)
    fake_tweets_count = df[df['label'] == True]['tweet_id'].count()
    true_tweets_count = df[df['label'] == False]['tweet_id'].count()
    plt_chart(['Fake', 'True'], [fake_tweets_count, true_tweets_count], 'number-of-true-fake-tweets', '', '',
              data_annotations=[(11200, '55.1%'), (9200, '44.9%')], xticks_text=True, xticks_angle=0,
              yticks_start=0, yticks_step=2000, width=2.5, height=6, colors=['FB4444', '6A9B67'])

    # (b) Fake vs. true tweets for each claim
    # Group the dataset by claim id and label, and count the tweets in each group
    claim_counts = df.groupby(['claim_id', 'label']).size().unstack()
    # Plot a diverging horizontal bar chart: true counts to the right, fake counts mirrored to the left
    fig, ax = plt.subplots(figsize=(10, 6))
    plt.barh(claim_counts.index, claim_counts[0], label='True Tweets', height=3, color='green')
    plt.barh(claim_counts.index, -claim_counts[1], label='Fake Tweets', height=3, color='red')
    # Add a vertical line at the center
    plt.axvline(0, color='black', linewidth=1.5)
    ax.set_xlabel('Number of Tweets', labelpad=18)
    ax.set_ylabel('Claim #', labelpad=18)
    ax.set_xticks(range(-400, 401, 100))
    ax.set_xticklabels(ax.get_xticks())
    ax.set_yticks(range(0, 801, 200))
    ax.set_yticklabels(ax.get_yticks())
    ax.legend()
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['bottom'].set_linewidth(2)
    ax.spines['left'].set_linewidth(1.5)
    ax.tick_params(axis='x', which='both', width=2, length=6)
    ax.tick_params(axis='y', which='both', width=2, length=6)
    ax.set_facecolor('#FCFAEB')
    ax.yaxis.grid(True, linestyle='-.', linewidth=0.5, color='gray')
    ax.xaxis.grid(True, linestyle='-.', linewidth=0.5, color='gray')
    plt.grid(axis='y')
    plt.show()


def tweets_per_user_chart(df):
    # Get the list of users and their tweet counts
    tweets_per_user = df.groupby('user_id')['tweet_id'].count()
    users = tweets_per_user.index
    tweet_counts = tweets_per_user.values
    plt_chart(list(range(len(users))), tweet_counts, 'number-of-tweets-per-user', 'Users', 'Number of Tweets',
              chart_type='plot', show_xticks=False, yticks_start=0, yticks_step=20, width=15, height=5)


def accumulative_users_count(df):
    # For each threshold, count how many users posted more than that many tweets
    tweets_per_user = df.groupby('user_id')['tweet_id'].count()
    tweets_count = [1, 2, 4, 6, 8, 10, 15, 20, 30, 40, 50, 60, 70, 120]
    users_no = [len(tweets_per_user[tweets_per_user > count]) for count in tweets_count]
    tweets_count = [f'>{count}' for count in tweets_count]
    print(tweets_count)
    print(users_no)


def tweets_timeline_chart(df):
    df['created_at'] = pd.to_datetime(df['created_at'])
    # Create a new column holding the tweet's year
    df['tweet_year'] = df['created_at'].dt.strftime('%Y')
    yearly_tweets = df.groupby('tweet_year')['tweet_id'].count()
    plt_chart(yearly_tweets.index, yearly_tweets.values, title='tweets-timeline', xlabel='Year',
              ylabel='Number of Tweets', show_title=False, show_xticks=True, show_yticks=True,
              data_annotations='y-values', chart_type='plot', xticks_text=True, xticks_angle=0,
              yticks_start=0, yticks_step=1000, grid='x', width=10, height=6, colors=colors, marker='o')

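
# users_locations_heatmap below assumes 'locations-iso-mapping.xlsx' maps the
# free-text 'location' field to ISO 3166-1 alpha-3 codes, i.e. two columns
# (layout inferred from the merge; example rows are illustrative):
#
#   location        iso_a3
#   Cairo, Egypt    EGY
#   Riyadh          SAU
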

def users_locations_heatmap(df):
    # Map free-text user locations to ISO 3166-1 alpha-3 country codes
    df_iso_3 = pd.read_excel('locations-iso-mapping.xlsx')
    merged_loc_df = pd.merge(df['location'], df_iso_3, on='location', how='outer')
    locations = merged_loc_df['iso_a3'].value_counts().rename_axis('iso_a3').reset_index(name='count')
    # Read the world shapefile and keep only countries with more than 10 tweets
    # ('naturalearth_lowres' ships with geopandas versions before 1.0)
    world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
    merged = world.merge(locations[locations['count'] > 10], on='iso_a3', how='inner')
    # Create a map object
    world_map = folium.Map(location=[58.4052172, -109.6062729], zoom_start=2.5, tiles='CartoDB positron',
                           scrollWheelZoom=False, dragging=True)
    # Create a Choropleth layer with the count data and add it to the map
    folium.Choropleth(
        # geo_data='world-countries.json',
        geo_data='https://raw.githubusercontent.com/python-visualization/folium/master/examples/data/world-countries.json',
        name='count',
        data=merged,
        columns=['name', 'count'],
        key_on='feature.properties.name',
        fill_color='PuBu',
        fill_opacity=0.6,
        line_opacity=0.05,
        bins=[11, 50, 102, 400, 2448],
        legend_name='Count',
        highlight=True,
        nan_fill_color='white'
    ).add_to(world_map)
    # Add a layer control to the map
    folium.LayerControl().add_to(world_map)
    # Save the map to an HTML file and open it in a web browser
    world_map.save('results/world_map.html')
    webbrowser.open('results/world_map.html')


def display_users_stats(df):
    print('-' * 60)
    print('USERS STATISTICS')
    print('-' * 60)
    print('avg. number of followers:', int(df['followers_count'].mean()))
    print('avg. number of following:', int(df['following_count'].mean()))
    print('avg. number of tweets:', int(df['tweet_count'].mean()))
    print('avg. number of listed:', int(df['listed_count'].mean()))
    print('number of distinct verified users:', len(df[df['verified'] == True]['user_id'].unique()))
    print('number of fake tweets by verified users:', len(df[(df['label'] == 1) & (df['verified'] == 1)]))
    print('number of true tweets by verified users:', len(df[(df['label'] == 0) & (df['verified'] == 1)]))
    print('-' * 60, '\n\n')


def user_accounts_charts(df):
    df['user_created_at'] = pd.to_datetime(df['user_created_at'])
    df['user_year'] = df['user_created_at'].dt.strftime('%Y')
    yearly_users = df.groupby('user_year')['user_id'].count()
    plt_chart(yearly_users.index, yearly_users.values, title='users-accounts-timeline', xlabel='',
              ylabel='Number of User Accounts', show_title=False, show_xticks=True, show_yticks=True,
              data_annotations='y-values', chart_type='bar', xticks_text=True, xticks_angle=30,
              yticks_start=0, yticks_step=500, grid='x', width=10, height=5, colors=colors_dark)


def account_age_chart(df):
    # Calculate the age of each user account, in months, at the time the tweet was posted
    df['created_at'] = pd.to_datetime(df['created_at'])
    df['user_created_at'] = pd.to_datetime(df['user_created_at'])
    df['user_acc_months_age'] = df.apply(
        lambda row: (row['created_at'].year - row['user_created_at'].year) * 12
                    + (row['created_at'].month - row['user_created_at'].month), axis=1)
    user_acc_months_age = df['user_acc_months_age'].value_counts()
    user_acc_months_age_sorted = user_acc_months_age.sort_index(ascending=True)
    plt_chart(user_acc_months_age_sorted.index, user_acc_months_age_sorted.values,
              title='users-accounts-months-age',
              xlabel='Difference in Months between Profile Date and Tweet Date', ylabel='Count',
              show_title=False, show_xticks=True, show_yticks=True, data_annotations=None, chart_type='plot',
              xticks_text=False, xticks_angle=0, xticks_step=10, yticks_start=0, yticks_step=50, grid='x',
              width=10, height=6, colors=colors)


def annotation_types_chart(df):
    annotations_type = df[['organizations_count', 'persons_count', 'places_count', 'products_count']].sum()
    plt_chart(annotations_type.index, annotations_type.values, title='Annotations Count', xlabel='',
              xticks_align='right', ylabel='Count', yticks_step=2000, xticks_text=True, xticks_angle=25,
              width=4, height=4, colors=['fb5607', '4c5146', '2b2c49', 'b42626'])

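
# Arabic glyphs must be reshaped into their joined forms and reordered for
# right-to-left rendering before WordCloud can draw them correctly; the two
# word-cloud helpers below rely on this transformation. A minimal sketch:
#
#   reshaped = arabic_reshaper.reshape('الأخبار')   # join letter forms
#   displayable = get_display(reshaped)             # apply the BiDi algorithm
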

def list_to_word_cloud(lst, is_arabic=True):
    # Load the Arabic stop words
    stop_words = set(stopwords.words('arabic'))
    text_reshaped = []
    for item in lst:
        if item not in stop_words and len(item) > 2:
            try:
                if is_arabic:
                    text_reshaped.append(get_display(arabic_reshaper.reshape(item)))
                else:
                    text_reshaped.append(item)
            except Exception:
                pass
    text_counter = Counter(text_reshaped)
    # font_path can be replaced with any font that supports Arabic
    plt.imshow(WordCloud(background_color='white',
                         font_path=r'C:\Windows\Fonts\Arial.ttf').generate_from_frequencies(text_counter))
    plt.axis("off")
    plt.title('')
    plt.show()


def annotation_word_cloud(df):
    # The annotations are stored as stringified lists; parse them back into Python lists
    df['annotations_text'] = df['annotations_text'].apply(ast.literal_eval)
    annotations_text_list = []
    for index, row in df.iterrows():
        annotations_text_list += row['annotations_text']
    list_to_word_cloud(annotations_text_list)


def plt_pie_chart(sizes, labels, width=8, height=6, colors=colors_pie):
    wedges, _ = plt.pie(sizes, labels=[''] * len(labels), colors=['#' + c for c in colors], startangle=90,
                        counterclock=False, wedgeprops={'edgecolor': 'white'})
    # Punch a white circle into the middle to draw the pie as a donut
    center_circle = plt.Circle((0, 0), 0.6, color='white')
    plt.gca().add_artist(center_circle)
    # Create the legend with labels, counts, and percentages
    total = sum(sizes)
    leg_labels = []
    for label, size in zip(labels, sizes):
        leg_labels.append('{: <16} {: >4} (~{:0>2d}%)'.format(label, size, round(size * 100 / total)))
    plt.legend(wedges, leg_labels, loc="center left", bbox_to_anchor=(1, 0.5))
    # Adjust the figure size and padding
    fig = plt.gcf()
    fig.set_size_inches(width, height)
    plt.subplots_adjust(right=0.45)
    plt.axis('equal')
    plt.show()


def news_domains_chart(df):
    domains = df['domain_label'].value_counts().sort_values(ascending=True)
    plt_pie_chart(domains.values, domains.index, height=3)


def plt_stack_chart(x_values, y_values_bottom, y_values_top, ylabel='', bottom_label='', top_label='',
                    show_xticks=True, show_yticks=True, xticks_align='center', xticks_angle=0, yticks_start=0,
                    yticks_step=10, yticks_end=100, grid='x', width=10, height=6, show_legend=True,
                    colors=colors):
    # Set the overall figure size
    fig, ax = plt.subplots(figsize=(width, height))
    # Accept hex colors with or without the leading '#'
    bar_colors = [c if c.startswith('#') else '#' + c for c in colors[:2]]
    plt.bar(x_values, y_values_bottom, label=bottom_label, color=bar_colors[0])
    plt.bar(x_values, y_values_top, bottom=y_values_bottom, label=top_label, color=bar_colors[1])
    ax.set_ylabel(ylabel, labelpad=18)
    if show_xticks:
        ax.set_xticks(x_values)
        ax.set_xticklabels(x_values, rotation=xticks_angle, ha=xticks_align)
        ax.tick_params(axis='x', which='both', width=2, length=6)
    else:
        ax.set_xticks([])
    if show_yticks:
        y_ticks = range(yticks_start, int(yticks_end) + 1, yticks_step)
        ax.set_yticks(y_ticks)
        ax.set_yticklabels(y_ticks)
        ax.tick_params(axis='y', which='both', width=2, length=6)
    else:
        ax.set_yticks([])
    ax.yaxis.grid(True, linestyle='-.', linewidth=0.5, color='gray')
    ax.xaxis.grid(True, linestyle='-.', linewidth=0.5, color='gray')
    plt.grid(axis=grid)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['bottom'].set_linewidth(2)
    ax.spines['left'].set_linewidth(1.5)
    if show_legend:
        plt.legend()
    ax.set_facecolor('#FCFAEB')
    plt.tight_layout()
    plt.show()

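
# Minimal usage sketch for plt_stack_chart (made-up category names and counts,
# kept commented out so nothing extra runs):
#
#   plt_stack_chart(['Health', 'Politics'], [120, 300], [80, 150],
#                   ylabel='Number of tweets', bottom_label='Fake',
#                   top_label='True', yticks_step=100, yticks_end=500)
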

def news_domains_bar(df):
    domain_labels_counts = df.groupby(['domain_label', 'label']).size().unstack(fill_value=0)
    # The distinct values of 'domain_label' form the x-axis
    x_values = domain_labels_counts.index
    # Counts for label values 1 (fake) and 0 (true) form the stacked y-values
    y_values_fake = domain_labels_counts[1]
    y_values_true = domain_labels_counts[0]
    plt_stack_chart(x_values, y_values_fake, y_values_true, ylabel='Number of tweets', bottom_label='Fake',
                    top_label='True', show_xticks=True, show_yticks=True, xticks_align='right',
                    xticks_angle=50, yticks_start=0, yticks_step=500, yticks_end=5000, grid='x', width=5,
                    height=4, show_legend=True, colors=['#FF6700', '#3A6EA5'])


def text_to_word_cloud(text, is_arabic=True):
    words = text.split()
    # Remove stop words and words shorter than 3 characters
    stop_words = set(stopwords.words('arabic'))
    words = [word for word in words if word.lower() not in stop_words and len(word) > 2]
    text = ' '.join(words)
    if is_arabic:
        text = get_display(arabic_reshaper.reshape(str(text)))
    # font_path can be replaced with any font that supports Arabic
    plt.imshow(WordCloud(colormap='Dark2_r', font_path=r'C:\Windows\Fonts\Arial.ttf',
                         background_color='white').generate(text))
    plt.axis("off")
    plt.title('')
    plt.show()


def selected_words_cloud(df, domain_name):
    df_domain = df[df['domain_label'] == domain_name]
    text = df_domain['text_arabic'].str.cat(sep=' ')
    text_to_word_cloud(text)


def main():
    # nltk.download('stopwords')

    # Load the dataset file
    df = pd.read_excel('VERA-ARAB_dataset.xlsx')
    # df = pd.read_excel('06-VERA-ARB-Dataset.xlsx')
    df['text_arabic'] = df['text_arabic'].fillna('')
    print('\n\nrows and columns', df.shape, '\n\n')

    # Dataset statistics
    display_stats(df)

    # Tweets distribution
    fake_true_charts(df)

    # Tweets per user chart
    tweets_per_user_chart(df)

    # Accumulative number of users
    accumulative_users_count(df)

    # Tweets timeline
    tweets_timeline_chart(df)

    # Heatmap of users' locations
    users_locations_heatmap(df)

    # Users statistics
    display_users_stats(df)

    # User accounts by year chart
    user_accounts_charts(df)

    # Count of user account ages, in months, at publish time
    account_age_chart(df)

    # Text annotations in tweets:
    # (a) total count of annotated text by each annotation type
    annotation_types_chart(df)
    # (b) word cloud of the most frequent annotated text
    annotation_word_cloud(df)

    # Tweets classified by news domain
    news_domains_chart(df)

    # Fake/true labels for each news domain
    news_domains_bar(df)

    # Word clouds of selected domains:
    # (a) sports news
    selected_words_cloud(df, 'Sports')
    # (b) political news
    selected_words_cloud(df, 'Politics')
    # (c) armed conflict news
    selected_words_cloud(df, 'Armed Conflict')

    print('-' * 30, 'FINISHED', '-' * 30)


if __name__ == "__main__":
    main()