"""
@author: Mohamed A. Mostafa
"""

import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
import folium
import webbrowser
from nltk.corpus import stopwords
from collections import Counter
from arabic_reshaper import arabic_reshaper
from bidi.algorithm import get_display
from wordcloud import WordCloud
import ast


colors = ["023e8a", "015ba0", "0077b6", "0087bf", "0096c7", "00b4d8", "48cae4", "90e0ef", "ade8f4", "caf0f8"]
colors_dark = ["ffbe0b","fd8a09","fb5607","4c5146","3c3f48","2b2c49","8d2843","ef233c","b42626","78290f"]
colors_pie = ["0D59A0", "0a9396","94d2bd","e9d8a6","ee9b00","ca6702","bb3e03"]


def display_stats(df):
    print('-' * 60)
    print('DATASET STATISTICS')
    print('-' * 60)
    print('number of claims:', df['claim_id'].nunique())
    print('number of tweets:', df['tweet_id'].nunique())
    print('count of true tweets:', df['label'].value_counts()[0])
    print('count of fake tweets:', df['label'].value_counts()[1])
    print('min. number of tweets per claim:', df.groupby('claim_id')['tweet_id'].count().min())
    print('max. number of tweets per claim:', df.groupby('claim_id')['tweet_id'].count().max())
    print('possibly sensitive tweets:', df['possibly_sensitive'].value_counts()[True])
    print('max. true tweets in one claim:', df.groupby(['claim_id', 'label']).size().unstack()[0].max())
    print('max. fake tweets in one claim:', df.groupby(['claim_id', 'label']).size().unstack()[1].max())
    print('count of unique users:', df['user_id'].nunique())
    print('tweets of verified users:', df[df['verified'] == True].shape[0])
    print('min. number of tweets per user:', df.groupby('user_id')['tweet_id'].count().min())
    print('max. number of tweets per user:', df.groupby('user_id')['tweet_id'].count().max())
    print('-' * 60, '\n\n')


def plt_chart(x_values, y_values, title='', xlabel='', ylabel='', show_title=False, show_xticks=True, show_yticks=True,
              data_annotations=None, chart_type='bar', xticks_text=True, xticks_step=1, xticks_angel=0,
              xticks_align='center', yticks_start=0,
              yticks_step=10, grid='x', width=10, height=6, marker=None, fillstyle='full', colors=colors):
    # Set the total width of the figure
    fig, ax = plt.subplots(figsize=(width, height))

    if chart_type == 'bar':
        plt.bar(x_values, y_values, color=['#' + c for c in colors])
    elif chart_type == 'plot':
        plt.plot(x_values, y_values, color='#' + colors[0], marker=marker, fillstyle=fillstyle)

    if show_title:
        ax.set_title(title)
    ax.set_xlabel(xlabel, labelpad=18)
    ax.set_ylabel(ylabel, labelpad=18)

    if show_xticks:
        if xticks_step > 1:
            x_ticks = range(min(x_values), int(max(x_values)) + xticks_step, xticks_step)
            ax.set_xticks(x_ticks)
        else:
            ax.set_xticks(x_values)
        if (xticks_text):
            ax.set_xticklabels(x_values, rotation=xticks_angel, ha=xticks_align)
        else:
            if xticks_step > 1:
                ax.set_xticklabels(x_ticks, rotation=xticks_angel)
            else:
                ax.set_xticklabels(x_values, rotation=xticks_angel)
        ax.tick_params(axis='x', which='both', width=2, length=6)
    else:
        ax.set_xticks([])

    if show_yticks:
        y_ticks = range(yticks_start, int(max(y_values)) + yticks_step, yticks_step)
        ax.set_yticks(y_ticks)
        ax.set_yticklabels(y_ticks)
        ax.tick_params(axis='y', which='both', width=2, length=6)
    else:
        ax.set_yticks([])

    ax.yaxis.grid(True, linestyle='-.', linewidth=0.5, color='gray')
    ax.xaxis.grid(True, linestyle='-.', linewidth=0.5, color='gray')
    plt.grid(axis=grid)

    if data_annotations == 'y-values':
        for i in range(len(x_values)):
            plt.annotate(f"{y_values[i]}", (x_values[i], y_values[i]), textcoords="offset points", xytext=(0, 10),
                         ha='center', color='#126782')
    elif data_annotations is not None:
        for i in range(len(data_annotations)):
            plt.text(i, data_annotations[i][0], data_annotations[i][1], ha='center', va='bottom', color='#126782',
                     fontsize=14)

    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['bottom'].set_linewidth(2)
    ax.spines['left'].set_linewidth(1.5)

    ax.set_facecolor('#FCFAEB')
    plt.tight_layout()
    plt.show()


def fake_true_charts(df):
    # (a) bar chart of fake- true tweets

    # Calculate the count of fake and true tweets
    fake_tweets_count = df[df['label'] == True]['tweet_id'].count()
    true_tweets_count = df[df['label'] == False]['tweet_id'].count()

    plt_chart(['Fake', 'True'], [fake_tweets_count, true_tweets_count],
              'number-of-true-fake-tweets', '', '', data_annotations=[(11200, '55.1%'), (9200, '44.9%')],
              xticks_text=True, xticks_angel=0, yticks_start=0, yticks_step=2000, width=2.5, height=6,
              colors=['FB4444', '6A9B67'])

    # (b) fake-true for each claim
    # Group the dataset by claim number and label, and count the number of tweets
    claim_counts = df.groupby(['claim_id', 'label']).size().unstack()

    # Plotting the horizontal bar chart
    fig, ax = plt.subplots(figsize=(10, 6))
    plt.barh(claim_counts.index, claim_counts[0], label='True Tweets', height=3, color='green')
    plt.barh(claim_counts.index, -claim_counts[1], label='Fake Tweets', height=3, color='red')

    # Add a vertical line at the center
    plt.axvline(0, color='black', linewidth=1.5)

    ax.set_xlabel('Number of Tweets', labelpad=18)
    ax.set_ylabel('Claim #', labelpad=18)

    ax.set_xticks(range(-400, 401, 100))
    ax.set_xticklabels(ax.get_xticks())

    ax.set_yticks(range(0, 801, 200))
    ax.set_yticklabels(ax.get_yticks())

    plt.legend()
    ax.legend()

    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['bottom'].set_linewidth(2)
    ax.spines['left'].set_linewidth(1.5)

    ax.tick_params(axis='x', which='both', width=2, length=6)
    ax.tick_params(axis='y', which='both', width=2, length=6)

    ax.set_facecolor('#FCFAEB')
    ax.yaxis.grid(True, linestyle='-.', linewidth=0.5, color='gray')
    ax.xaxis.grid(True, linestyle='-.', linewidth=0.5, color='gray')
    plt.grid(axis='y')

    plt.show()


def tweets_per_user_chart(df):
    # Get the list of users and their tweet counts
    tweets_per_user = df.groupby('user_id')['tweet_id'].count()
    users = tweets_per_user.index
    tweet_counts = tweets_per_user.values

    plt_chart(list(range(len(users))), tweet_counts, 'number-of-tweets-per-user', 'Users', 'Number of Tweets',
              chart_type='plot',
              show_xticks=False, yticks_start=0, yticks_step=20, width=15, height=5)


def accumulative_users_count(df):
    tweets_per_user = df.groupby('user_id')['tweet_id'].count()
    tweets_count = [1, 2, 4, 6, 8, 10, 15, 20, 30, 40, 50, 60, 70, 120]
    users_no = [len(tweets_per_user[tweets_per_user > count]) for count in tweets_count]
    tweets_count = [f'>{count}' for count in tweets_count]
    print(tweets_count)
    print(users_no)


def tweets_timeline_chart(df):
    df['created_at'] = pd.to_datetime(df['created_at'])
    # Create new column year
    df['tweet_year'] = df['created_at'].dt.strftime('%Y')
    yearly_tweets = df.groupby('tweet_year')['tweet_id'].count()

    plt_chart(yearly_tweets.index, yearly_tweets.values, title='tweets-timeline', xlabel='Year',
              ylabel='Number of Tweets',
              show_title=False, show_xticks=True, show_yticks=True, data_annotations='y-values', chart_type='plot',
              xticks_text=True, xticks_angel=0, yticks_start=0, yticks_step=1000, grid='x', width=10, height=6,
              colors=colors, marker='o')


def users_locations_heatmap(df):
    df_iso_3 = pd.read_excel('locations-iso-mapping.xlsx')
    merged_loc_df = pd.merge(df['location'], df_iso_3, on='location', how='outer')
    locations = merged_loc_df['iso_a3'].value_counts()
    # Read the world shapefile
    world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
    merged = world.merge(locations[locations > 10], on='iso_a3', how='inner')
    # Assuming you have a merged DataFrame named 'merged' with geometry, count, name columns

    # Create a map object
    world_map = folium.Map(location=[58.4052172, -109.6062729], zoom_start=2.5, tiles='CartoDB positron',
                           scrollWheelZoom=False, dragging=True)

    # Create a Choropleth layer with the count data
    choropleth_layer = folium.Choropleth(
        #geo_data='world-countries.json',
        geo_data='https://raw.githubusercontent.com/python-visualization/folium/master/examples/data/world-countries.json',
        name='count',
        data=merged,
        columns=['name', 'count'],
        key_on='feature.properties.name',
        fill_color='PuBu',
        fill_opacity=0.6,
        line_opacity=0.05,
        bins=[11, 50, 102, 400, 2448],
        legend_name='Count',
        highlight=True,
        nan_fill_color='white'
    ).add_to(world_map)

    # Add the Choropleth layer to the map
    folium.LayerControl().add_to(world_map)

    # Save the map to an HTML file
    world_map.save('results/world_map.html')

    # Open the map in a web browser
    webbrowser.open('results/world_map.html')


def display_users_stats(df):
    print('-' * 60)
    print('USERS STATISTICS')
    print('-' * 60)
    print('avg. number of followers:', int(df['followers_count'].mean()))
    print('avg. number of following:', int(df['following_count'].mean()))
    print('avg. number of tweets:', int(df['tweet_count'].mean()))
    print('avg. number of listed:', int(df['listed_count'].mean()))
    print('number of distinct verified users:', len(df[df['verified'] == True]['user_id'].unique()))
    print('number of fake tweets by verified users:', len(df[(df['label'] == 1) & (df['verified'] == 1)]))
    print('number of true tweets by verified users:', len(df[(df['label'] == 0) & (df['verified'] == 1)]))
    print('-' * 60, '\n\n')


def user_accounts_charts(df):
    df['user_created_at'] = pd.to_datetime(df['user_created_at'])
    df['user_year'] = df['user_created_at'].dt.strftime('%Y')
    yearly_users = df.groupby('user_year')['user_id'].count()
    plt_chart(yearly_users.index, yearly_users.values, title='users-accounts-timeline', xlabel='',
              ylabel='Number of User Accounts', show_title=False, show_xticks=True, show_yticks=True,
              data_annotations='y-values', chart_type='bar', xticks_text=True, xticks_angel=30, yticks_start=0,
              yticks_step=500, grid='x', width=10, height=5, colors=colors_dark, marker='o')


def account_age_chart(df):
    # Calculate the difference of user account in months
    df['created_at'] = pd.to_datetime(df['created_at'])
    df['user_created_at'] = pd.to_datetime(df['user_created_at'])
    df['user_acc_months_age'] = df.apply(lambda row: (row['created_at'].year - row['user_created_at'].year) * 12 + (
                row['created_at'].month - row['user_created_at'].month), axis=1)
    user_acc_months_age = df['user_acc_months_age'].value_counts()
    user_acc_months_age_sorted = user_acc_months_age.sort_index(ascending=True)
    plt_chart(user_acc_months_age_sorted.index, user_acc_months_age_sorted.values, title='users-accounts-monhts-age',
              xlabel='Difference in Months between Profile Date and Tweet Date', ylabel='Count', show_title=False,
              show_xticks=True, show_yticks=True,
              data_annotations=None, chart_type='plot', xticks_text=False, xticks_angel=0,
              xticks_step=10, yticks_start=0, yticks_step=50, grid='x', width=10, height=6, colors=colors)


def annotation_types_chart(df):
    annotations_type = df[['organizations_count', 'persons_count', 'places_count', 'products_count']].sum()

    plt_chart(annotations_type.index, annotations_type.values, title='Annotations Count', xlabel='',
              xticks_align='right', ylabel='Count', yticks_step=2000,
              xticks_text=True, xticks_angel=25, width=4, height=4,
              colors=['fb5607', '4c5146', '2b2c49', 'b42626'])


def list_to_word_cloud(lst, is_arabic=True):
    # Load the Arabic stop words
    stop_words = set(stopwords.words('arabic'))

    text_reshaped = []
    for item in lst:
        if item not in stop_words and len(item) > 2:
            try:
                if is_arabic:
                    text_reshaped.append(get_display(arabic_reshaper.reshape(item)))
                else:
                    text_reshaped.append(item)
            except:
                pass
    text_counter = Counter(text_reshaped)
    plt.imshow(WordCloud(background_color='white', font_path='C:\Windows\Fonts\Arial.ttf').generate_from_frequencies(text_counter))
    # YOU CAN REPLACE font_path WITH ANY FONT SUPPORTS ARABIC
    plt.axis("off")
    plt.title('')
    plt.show()


def annotation_word_cloud(df):
    df['annotations_text'] = df['annotations_text'].apply(ast.literal_eval)
    annotations_text_list = []
    for index, row in df.iterrows():
        sub_list = row['annotations_text']
        annotations_text_list += sub_list
    list_to_word_cloud(annotations_text_list)


def plt_pie_chart(sizes, labels, width=8, height=6, colors=colors_pie):
    wedges, _ = plt.pie(sizes, labels=[''] * len(labels), colors=['#' + c for c in colors], startangle=90,
                        counterclock=False,
                        wedgeprops={'edgecolor': 'white'})

    center_circle = plt.Circle((0, 0), 0.6, color='white')
    plt.gca().add_artist(center_circle)

    # Create the legend with labels and percentages
    total = sum(sizes)
    leg_labels = []
    for label, size in zip(labels, sizes):
        leg_labels.append('{: <16} {: >4} (~{:0>2d}%)'.format(label, size, round(size * 100 / total)))

    plt.legend(wedges, labels, loc="center left", bbox_to_anchor=(1, 0.5), labels=leg_labels)

    # Adjust figure size and padding
    fig = plt.gcf()
    fig.set_size_inches(width, height)  # Set the figure size in inches
    plt.subplots_adjust(right=0.45)  # Adjust the right padding

    plt.axis('equal')
    plt.show()


def news_domains_chart(df):
    domains = df['domain_label'].value_counts().sort_values(ascending=True)
    plt_pie_chart(domains.values, domains.index, height=3)


def plt_stack_chart(x_values, y_values_bottom, y_values_top, ylabel='', bottom_label='',
                    top_label='',
                    show_xticks=True, show_yticks=True, xticks_align='center',
                    xticks_angel=0, yticks_start=0, yticks_step=10, yticks_end=100, grid='x', width=10, height=6,
                    show_legend=True,
                    colors=colors):
    # Set the total width of the figure
    fig, ax = plt.subplots(figsize=(width, height))

    plt.bar(x_values, y_values_bottom, label=bottom_label, color=colors[0])
    plt.bar(x_values, y_values_top, bottom=y_values_bottom, label=top_label, color=colors[1])

    ax.set_ylabel(ylabel, labelpad=18)

    if show_xticks:
        ax.set_xticklabels(x_values, rotation=xticks_angel, ha=xticks_align)
        ax.tick_params(axis='x', which='both', width=2, length=6)
    else:
        ax.set_xticks([])

    if show_yticks:
        y_ticks = range(yticks_start, int(yticks_end) + 1, yticks_step)
        ax.set_yticks(y_ticks)
        ax.set_yticklabels(y_ticks)
        ax.tick_params(axis='y', which='both', width=2, length=6)
    else:
        ax.set_yticks([])

    ax.yaxis.grid(True, linestyle='-.', linewidth=0.5, color='gray')
    ax.xaxis.grid(True, linestyle='-.', linewidth=0.5, color='gray')
    plt.grid(axis=grid)

    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['bottom'].set_linewidth(2)
    ax.spines['left'].set_linewidth(1.5)

    if show_legend:
        plt.legend()

    ax.set_facecolor('#FCFAEB')
    plt.tight_layout()
    plt.show()


def news_domains_bar(df):
    domain_labels_counts = df.groupby(['domain_label', 'label']).size().unstack(fill_value=0)
    # Extract the distinct values of 'domain' for the x-axis
    x_values = domain_labels_counts.index

    # Get the counts for 'label' values 1 and 0 for the y-axis
    y_values_fake = domain_labels_counts[1]
    y_values_true = domain_labels_counts[0]

    plt_stack_chart(x_values, y_values_fake, y_values_true,
                    ylabel='Number of tweets', bottom_label='Fake', top_label='True',
                    show_xticks=True, show_yticks=True, xticks_align='right',
                    xticks_angel=50, yticks_start=0, yticks_step=500, yticks_end=5000,
                    grid='x', width=5, height=4, show_legend=True, colors=['#FF6700', '#3A6EA5'])


def text_to_word_cloud(text, is_arabic=True):
    words = text.split()

    # Remove stop words or words len < 3
    stop_words = set(stopwords.words('arabic'))
    words = [word for word in words if word.lower() not in stop_words and len(word) > 2]
    text = ' '.join(words)
    if is_arabic:
        text = get_display(arabic_reshaper.reshape(str(text)))

    plt.imshow(WordCloud(colormap='Dark2_r', font_path='C:\Windows\Fonts\Arial.ttf', background_color='white').generate(text))
    # YOU CAN REPLACE font_path WITH ANY FONT SUPPORTS ARABIC
    plt.axis("off")
    plt.title('')
    plt.show()


def selected_words_cloud(df, domain_name):
    df_domain = df[df['domain_label'] == domain_name]
    text = df_domain['text_arabic'].str.cat(sep=' ')
    text_to_word_cloud(text)


def main():
    #nltk.download('stopwords')


    # Loading Dataset File
    df = pd.read_excel('VERA-ARAB_dataset.xlsx')
    #df = pd.read_excel('06-VERA-ARB-Dataset.xlsx')

    df['text_arabic'] = df['text_arabic'].fillna('')

    print('\n\nrows and columns', df.shape, '\n\n')

    # Dataset Statistics
    display_stats(df)

    # Tweets Distribution
    fake_true_charts(df)

    # Tweets per each user chart
    tweets_per_user_chart(df)

    # Accumulative number of users
    accumulative_users_count(df)

    # Tweets timeline
    tweets_timeline_chart(df)

    # Heatmap of users locations
    users_locations_heatmap(df)

    # Users Statistics
    display_users_stats(df)

    # Users Accounts by Year Chart
    user_accounts_charts(df)

    # Count of user accounts ages by months at the time of publish
    account_age_chart(df)

    # Text annotations in tweets
    # (a) shows the total count of annotated text by each annotation type
    annotation_types_chart(df)

    # (b) shows words cloud of the most frequent annotated text
    annotation_word_cloud(df)

    # Tweets classifications according to news domain
    news_domains_chart(df)

    # Fake/True labels for each news domain
    news_domains_bar(df)

    # Words cloud of selected domains
    # (a) sports news
    selected_words_cloud(df, 'Sports')
    # (b) political news
    selected_words_cloud(df, 'Politics')
    # (c) armed conflict news
    selected_words_cloud(df, 'Armed Conflict')

    print('-' * 30, 'FINISHED', '-' * 30)


if __name__ == "__main__":
    main()