## **Exploratory and Causal Analysis of the Climate Change Dataset**

# Libraries


In [None]:
!pip install lingam
!pip install tigramite
!pip install statsmodels
!pip install efficient-apriori

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from efficient_apriori import apriori
from graphviz import Digraph
from google.colab import drive
from google.colab import files

In [None]:
drive.mount('/content/drive', force_remount=True)

# Read DataFrames

In [None]:
# The file "The Climate Change Twitter Dataset.csv" is available at https://www.kaggle.com/datasets/deffro/the-climate-change-twitter-dataset

tweets = pd.read_csv('/content/drive/My Drive/Dataset/The Climate Change Twitter Dataset.csv')

tweets.info()

In [None]:
tweets.loc[(tweets['topic'] == 'Politics') & (tweets['aggressiveness'] == 'aggressive')].iloc[-20000:]

In [None]:
# NOTE: `df` is an alias of `tweets`, not a copy, so every assignment below
# also changes `tweets`.
df = tweets
# Parse the timestamp column so the `.dt` accessors used later are available.
df['created_at'] = pd.to_datetime(df['created_at'])
df['topic'] = df['topic'].replace('Importance of Human Intervantion', 'Importance of Human Intervention') # Adapted to fix typo
df['topic'] = df['topic'].replace('Donald Trump versus Science', 'Denialist Politicians versus Science') # Adapted by journal request

# Distinct values / value ranges of the columns the analysis works with.
years = df['created_at'].dt.year.unique()
topics = df['topic'].unique()
minSentiment = df['sentiment'].min()
maxSentiment = df['sentiment'].max()
stances = df['stance'].unique()
aggressiveness = df['aggressiveness'].unique()

# Print a short summary of those ranges.
print(f"Years: [{years.min()}, {years.max()}]")
print(f"Topics: {topics}")
print(f"Sentiments: [{round(minSentiment, 4)}, {round(maxSentiment, 4)}]")
print(f"Stances: {stances}")
print(f"Aggressiveness: {aggressiveness}")

In [None]:
# The file "disasters.csv" is available at https://www.kaggle.com/datasets/deffro/the-climate-change-twitter-dataset

disasters = pd.read_csv('/content/drive/My Drive/Dataset/disasters.csv')
disasters['start_date'] = pd.to_datetime(disasters['start_date'])
disasters['end_date'] = pd.to_datetime(disasters['end_date'])
disasters.info()

In [None]:
print(f"Disaster Types: {disasters['Disaster Type'].unique()}")

print(f"Disaster Subtypes: {disasters['Disaster Subtype'].unique()}")

print(f"Dates : [{disasters['start_date'].min()}, {disasters['start_date'].max()}]")

# Plots

In [None]:
from matplotlib import pyplot as plt
from datetime import datetime
def normalize(value, min_val, max_val):
    """Linearly rescale ``value`` so that ``min_val`` maps to 0 and ``max_val`` maps to 1.

    Works element-wise when ``value`` is a pandas Series or a numpy array.
    """
    span = max_val - min_val
    offset = value - min_val
    return offset / span

def truncateNumber(number, digits):
    """Keep the ``digits`` most significant digits of ``number`` and zero the rest.

    The fractional part is discarded first and truncation is towards zero, e.g.
    ``truncateNumber(12345, 2) == 12000`` and ``truncateNumber(-12345, 2) == -12000``.
    Values that already have at most ``digits`` integer digits are returned
    unchanged (as an ``int``).

    Args:
        number: int, float or numpy scalar.
        digits: number of leading digits to keep.

    Returns:
        The truncated value as an ``int``.
    """
    whole = int(number)
    # Count digits on the absolute integer part: the decimal point and the
    # minus sign must not be counted as digits.
    exponent = len(str(abs(whole))) - digits
    if exponent <= 0:
        return whole
    magnitude = 10 ** exponent
    # Integer arithmetic avoids float rounding for large values.
    truncated = (abs(whole) // magnitude) * magnitude
    return -truncated if whole < 0 else truncated

def plotTweets(year=None, topics=None, groupBy="week", stances=None, aggressiveness=None):
    """Plot tweet volume with average sentiment, aggressive share and denier share.

    Reads the module-level ``df`` frame, applies the optional filters, groups the
    remaining rows and draws one bar series (number of tweets) plus three line
    series on a shared 0..1 scale. The figure is saved as ``plot_<timestamp>.pdf``
    in the current working directory and then shown.

    Args:
        year: if given, only tweets whose ``created_at`` year equals it are kept.
            When it is None and ``groupBy`` is not "topic", ``groupBy`` is
            overridden to "year" (so the "week" default only applies together
            with a ``year``).
        topics: optional list of topic names to keep.
        groupBy: "topic", "year", "month" or "week"; any other value groups by
            day of the year.
        stances: optional list of stance values to keep.
        aggressiveness: optional list of aggressiveness values to keep.
    """
    dfToPlot = df

    ## Filter dataframe to plot
    if not topics is None:
        dfToPlot = dfToPlot.loc[dfToPlot['topic'].isin(topics)]
    if not year is None:
        dfToPlot = dfToPlot.loc[dfToPlot['created_at'].dt.year == year]
    elif groupBy != "topic":
        # Without a year filter the data spans several years, so group per year.
        groupBy = "year"
    if not stances is None:
        dfToPlot = dfToPlot.loc[dfToPlot['stance'].isin(stances)]
    if not aggressiveness is None:
        dfToPlot = dfToPlot.loc[dfToPlot['aggressiveness'].isin(aggressiveness)]

    if groupBy == "topic":
        dfGrouped = dfToPlot.groupby(dfToPlot["topic"])
    elif groupBy == "year":
        dfGrouped = dfToPlot.groupby(dfToPlot["created_at"].dt.year)
    elif groupBy == "month":
        dfGrouped = dfToPlot.groupby(dfToPlot["created_at"].dt.month)
    elif groupBy == "week":
        dfGrouped = dfToPlot.groupby(dfToPlot["created_at"].dt.isocalendar().week)
    else:
        dfGrouped = dfToPlot.groupby(dfToPlot["created_at"].dt.dayofyear)

    # Topics are categorical: draw markers without connecting lines and rotate the labels.
    marker = "o" if groupBy == "topic" else ""
    linestyle = "" if groupBy == "topic" else "-"
    rotation = 90 if groupBy == "topic" else 0


    ## Plot number of tweets
    # Counts are scaled by the maximum count truncated to two leading digits,
    # so the tallest bar can end slightly above 1.
    numberOfTweets = dfGrouped["id"].count()
    numberOfTweetsNormalized = normalize(numberOfTweets, 0, truncateNumber(numberOfTweets.max(), 2))
    plt.bar(numberOfTweetsNormalized.index, numberOfTweetsNormalized.values, width=1, color="0.8", label="Number of Tweets")

    ## Plot average sentiment
    # Mean sentiment is mapped from [-1, 1] onto [0, 1].
    averageSentiment = dfGrouped.sentiment.mean()
    averageSentimentNormalized = normalize(averageSentiment, -1, 1)
    plt.plot(averageSentimentNormalized, color="blue", marker=marker, linestyle=linestyle, label="Sentiment")

    ## Plot aggressiveness
    # Share of tweets labelled 'aggressive' within each group.
    aggressivenessPercentages = dfGrouped["aggressiveness"].value_counts(normalize=True)
    aggressiveTweetsPercentages = aggressivenessPercentages.loc[aggressivenessPercentages.index.get_level_values('aggressiveness') == 'aggressive']
    aggressiveTweetsPercentages = aggressiveTweetsPercentages.reset_index(level='aggressiveness', drop=True)
    plt.plot(aggressiveTweetsPercentages, color="red", marker=marker, linestyle=linestyle, label="Aggressiveness")

    ## Plot deniers
    # Share of tweets with stance 'denier' within each group.
    stancePercentages = dfGrouped["stance"].value_counts(normalize=True)
    stanceTweetsPercentages = stancePercentages.loc[stancePercentages.index.get_level_values('stance') == 'denier']
    stanceTweetsPercentages = stanceTweetsPercentages.reset_index(level='stance', drop=True)
    plt.plot(stanceTweetsPercentages, color="green", marker=marker, linestyle=linestyle, label="Deniers")

    ## Customize ticks
    # The left axis is drawn on the 0..1 scale but labelled with tweet counts.
    max_value = truncateNumber(dfGrouped["id"].count().max(), 2)
    if max_value > 10:
      tick_interval = max_value * 0.2
      tick_positions = np.arange(0, max_value + tick_interval, tick_interval)
      tick_labels = ["{:,}".format(int(tick)) for tick in tick_positions]
      plt.yticks(np.arange(0, 1.1, 0.2), tick_labels)
    else:
      plt.yticks(np.arange(0, 1.1, 1), [0, int(max_value)])

    ## Customize labels
    plt.xlabel(groupBy.capitalize())
    plt.xticks(rotation=rotation)
    plt.ylabel("Number of Tweets")
    # The title lists every active filter (year, topics, stances, aggressiveness).
    plt.title("Tweets" + ((" " + str(year)) if year else "") + ((" (" + ', '.join(topics) + ")") if topics else "") + ((" (" + ', '.join([word.capitalize() + 's' for word in stances]) + ")") if stances else "") + ((" (" + ', '.join([word.capitalize() for word in aggressiveness]) + ")") if aggressiveness else ""))

    plt.legend()

    ## Add horizontal lines
    for i in range(11):
        plt.axhline(y=i * 0.1, color='0.7', linestyle='--', linewidth=0.7)

    # Add ticks for the right side of the Y-axis
    right_tick_positions = np.arange(0, 1.1, 0.2)
    right_tick_labels = ["{}%".format(int(tick * 100)) for tick in right_tick_positions]
    ax2 = plt.gca().twinx()
    # NOTE(review): 0.951 looks like a hand-tuned factor to line the percentage
    # ticks up with the left axis - confirm before changing the figure layout.
    ax2.set_yticks(np.arange(0, 1.1, 0.2) * 0.951 / (dfGrouped["id"].count().max() / max_value))
    ax2.set_yticklabels(right_tick_labels)
    ax2.tick_params(axis='y', which='both', length=0)
    ax2.set_ylabel('Sentiment, Aggressiveness and Deniers')


    # Millisecond-resolution timestamp so successive plots get distinct file names.
    now = datetime.now()
    timestamp = now.strftime('%Y-%m-%d_%H-%M-%S') + f'_{now.microsecond // 1000}'
    filename = f'plot_{timestamp}.pdf'

    # Save the plot as a PDF with the generated filename
    plt.savefig(filename,bbox_inches='tight')
    ## Show
    plt.show()

In [None]:
def generatePlots():
    """Draw the overview plots: tweets per year and tweets per topic.

    The per-year, per-topic, stance and aggressiveness variants are kept inside
    the string literal below, which is never executed.
    """
    ## General
    plotTweets()
    plotTweets(groupBy="topic")

    # Disabled plot variants: this bare string is a no-op expression statement.
    '''
    ## Years
    for year in years:
        plotTweets(year=year)

    ## Topics
    for topic in topics:
        plotTweets(topics=[topic])


    ## Deniers vs Not Deniers
    plotTweets(stances=["denier"])
    plotTweets(stances=["neutral", "believer"])

    ## Aggressive vs Not Aggressive
    plotTweets(aggressiveness=["aggressive"])
    plotTweets(aggressiveness=["not aggressive"])
    '''

generatePlots()

In [None]:
def plot_disasters(df):
  """Bar chart of the number of disasters per start year.

  NOTE: a later cell defines another function with this same name, which
  replaces this one once that cell has run.
  """
  start_year = df["start_date"].dt.year
  numberOfDisasters = df.groupby(start_year)["Disaster Type"].count()

  plt.bar(numberOfDisasters.index, numberOfDisasters.values)
  plt.title("Disasters")
  plt.show()

plot_disasters(disasters)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:


def plot_disasters(df):
  """Stacked bar chart of disasters per start year, one colour per disaster type.

  The figure is written to ``disasters.pdf`` in the working directory and shown.
  """
  start_year = df["start_date"].dt.year
  # Rows: start year, columns: disaster type, values: number of disasters.
  counts = df.groupby([start_year, "Disaster Type"]).size()
  counts_by_type = counts.unstack(fill_value=0)
  counts_by_type.plot.bar(stacked=True, figsize=(10, 6))

  plt.title("Disasters by Year and Type")
  plt.xlabel("Year")
  plt.ylabel("Number of Disasters")
  # Legend is placed outside the axes, to the right of the plot.
  plt.legend(title='Disaster Type', bbox_to_anchor=(1.05, 1), loc='upper left')
  plt.savefig('disasters.pdf',bbox_inches='tight')
  plt.show()


plot_disasters(disasters)



In [None]:
disasters['duration'] = disasters['end_date'] - disasters['start_date']
disasters.describe()

# Filter

In [None]:
# Drop the columns the rest of the analysis does not use.
# errors='ignore' keeps this cell re-runnable: `del df[col]` raised KeyError on a
# second execution. The drop stays in place (as `del` was) to avoid copying the
# full tweets frame.
df.drop(columns=['lat', 'lng', 'gender', 'temperature_avg'], inplace=True, errors='ignore')

In [None]:
# --- Tweets -------------------------------------------------------------------
# Keep tweets from 2007 onwards with a defined topic. The explicit .copy() makes
# the result independent of `df`, so the column assignment below does not write
# into a slice (SettingWithCopyWarning).
SENTIMENT_CUTOFF = 0.35
tweetsToKeep = (df['created_at'].dt.year >= 2007) & (df['topic'] != "Undefined / One Word Hashtags")
filteredTweetsDF = df.loc[tweetsToKeep].copy()

# Replace the numeric sentiment score by a three-way label. Scores that satisfy
# neither condition (including NaN) fall through to "Neutral Sentiment".
sentimentScore = filteredTweetsDF['sentiment']
filteredTweetsDF['sentiment'] = np.select(
    [sentimentScore >= SENTIMENT_CUTOFF, sentimentScore <= -SENTIMENT_CUTOFF],
    ["Positive Sentiment", "Negative Sentiment"],
    default="Neutral Sentiment",
)

# --- Disasters ----------------------------------------------------------------
# Keep disasters that start in 2007 or later and end in 2019 or earlier.
disastersInWindow = (disasters['start_date'].dt.year >= 2007) & (disasters['end_date'].dt.year <= 2019)
filteredDisastersDF = disasters.loc[disastersInWindow].copy()

# Duration in days, with a floor of one day so the per-day rates below are defined.
durationDays = (filteredDisastersDF['end_date'] - filteredDisastersDF['start_date']) / pd.Timedelta(days=1)
filteredDisastersDF['duration'] = durationDays.clip(lower=1.0)

# Disaster types left out of the analysis.
EXCLUDED_DISASTER_TYPES = ['Earthquake', 'Volcanic activity', 'Landslide', 'Mass movement (dry)']
filteredDisastersDF = filteredDisastersDF.loc[~filteredDisastersDF['Disaster Type'].isin(EXCLUDED_DISASTER_TYPES)]

# Keep only high-impact events: at least one per-day rate must exceed its threshold.
MIN_DEATHS_PER_DAY = 35
MIN_AFFECTED_PER_DAY = 400000
MIN_DAMAGES_PER_DAY = 400000  # in '000 US$
filteredDisastersDF = filteredDisastersDF.loc[(filteredDisastersDF['Total Deaths'] / filteredDisastersDF['duration'] > MIN_DEATHS_PER_DAY) |
                                              (filteredDisastersDF['No Affected'] / filteredDisastersDF['duration'] > MIN_AFFECTED_PER_DAY) |
                                              (filteredDisastersDF["Total Damages ('000 US$)"] / filteredDisastersDF['duration'] > MIN_DAMAGES_PER_DAY)]

In [None]:
filteredTweetsDF

In [None]:
filteredDisastersDF.info()

In [None]:
filteredDisastersDF.describe()

# Discretize and Group

In [None]:
def discretizeTweets(df, granularity="day"):
    """Turn labelled tweets into one row of 0.0/1.0 indicators per time bucket.

    For every bucket (day, week or month of ``created_at``) an indicator is 1.0
    when the share of tweets carrying the corresponding label is strictly above
    its threshold, otherwise 0.0. The input frame is not modified.

    Args:
        df: frame with the columns ``created_at`` (datetime), ``sentiment``
            ("Positive Sentiment" / "Negative Sentiment" / ...), ``stance``,
            ``aggressiveness`` and ``topic``.
        granularity: "day", "week"; any other value buckets by month.

    Returns:
        A float DataFrame with the columns "Positive Sentiment",
        "Negative Sentiment", "Deniers", "Aggressiveness" and one "[T] <topic>"
        column per topic. For "day" the index holds ``datetime.date`` objects;
        otherwise it is the period start minus one day, as timestamps.
    """
    topicThreshold = 0.25
    sentimentThreshold = 0.4
    aggressivenessThreshold = 0.4
    deniersThreshold = 0.2

    # Topics are listed here (in output column order) instead of being read
    # from a notebook-level variable, so the function works on its own.
    topicNames = (
        "Weather Extremes",
        "Importance of Human Intervention",
        "Seriousness of Gas Emissions",
        "Ideological Positions on Global Warming",
        "Impact of Resource Overconsumption",
        "Global stance",
        "Politics",
        "Significance of Pollution Awareness Events",
        "Denialist Politicians versus Science",
    )

    # Group on a derived key rather than writing a 'date' column into the
    # caller's frame. The key keeps the name 'date' for the resulting index.
    created = df['created_at'].rename('date')
    if granularity == "day":
        groupKey = created.dt.date
    elif granularity == "week":
        groupKey = created.dt.to_period('W')
    else:
        groupKey = created.dt.to_period('M')
    dfGrouped = df.groupby(groupKey)

    # Per-bucket share of every value, computed once per labelled column.
    shares = {
        column: dfGrouped[column].value_counts(normalize=True)
        for column in ("sentiment", "stance", "aggressiveness", "topic")
    }

    def exceeds(column, value, threshold):
        """Indicator per bucket: share of ``value`` in ``column`` is above ``threshold``.

        Buckets where ``value`` never occurs are absent from the result.
        """
        columnShares = shares[column]
        selected = columnShares.loc[columnShares.index.get_level_values(column) == value]
        selected = selected.reset_index(level=column, drop=True)
        return (selected > threshold).astype(float)

    indicators = {
        "Positive Sentiment": exceeds("sentiment", "Positive Sentiment", sentimentThreshold),
        "Negative Sentiment": exceeds("sentiment", "Negative Sentiment", sentimentThreshold),
        "Deniers": exceeds("stance", "denier", deniersThreshold),
        "Aggressiveness": exceeds("aggressiveness", "aggressive", aggressivenessThreshold),
    }
    for topic in topicNames:
        indicators["[T] " + topic] = exceeds("topic", topic, topicThreshold)

    # Aligning on the union of buckets leaves NaN where a label never occurred
    # in a bucket; that means "not above threshold".
    discreteDF = pd.DataFrame(indicators).fillna(0.0)

    if granularity != "day":
      # Label each period by the day before its start.
      discreteDF.index = discreteDF.index.to_timestamp() + pd.DateOffset(days=-1)
    return discreteDF

#discreteTweetsDF = discretizeTweets(filteredTweetsDF, granularity="week")
#discreteTweetsDF.head()

#discretizeTweets(filteredTweetsDF, granularity="week")

In [None]:
def individual_tweets_to_tuples(df):
    """Convert every tweet into a tuple of item labels for association-rule mining.

    Each tuple holds, in this order, the labels for the tweet's sentiment,
    stance, aggressiveness and topic. A value with no known label (for example
    the "Undefined / One Word Hashtags" topic) contributes no item.

    Args:
        df: frame with the columns ``sentiment``, ``stance``, ``aggressiveness``
            and ``topic``.

    Returns:
        A list with one tuple of strings per row of ``df``, in row order.
    """
    sentimentItems = {
        "Positive Sentiment": "Positive Sentiment",
        "Neutral Sentiment": "Neutral Sentiment",
        "Negative Sentiment": "Negative Sentiment",
    }
    stanceItems = {"denier": "Denier", "believer": "Believer", "neutral": "Neutral"}
    aggressivenessItems = {"aggressive": "Aggressive", "not aggressive": "Not Aggressive"}
    topicItems = {
        topic: "[T] " + topic
        for topic in (
            "Weather Extremes",
            "Importance of Human Intervention",
            "Seriousness of Gas Emissions",
            "Ideological Positions on Global Warming",
            "Impact of Resource Overconsumption",
            "Global stance",
            "Politics",
            "Significance of Pollution Awareness Events",
            "Denialist Politicians versus Science",
        )
    }

    result = []
    # Zipping the four columns avoids building a Series per row (iterrows),
    # which is very slow on a frame with millions of tweets.
    for sentiment, stance, aggressiveness, topic in zip(
        df["sentiment"], df["stance"], df["aggressiveness"], df["topic"]
    ):
        items = (
            sentimentItems.get(sentiment),
            stanceItems.get(stance),
            aggressivenessItems.get(aggressiveness),
            topicItems.get(topic),
        )
        result.append(tuple(item for item in items if item is not None))

    return result


# def grouped_tweets_to_tuples(df):
#     result = []
#     for index, row in df.iterrows():
#         result.append(tuple(x for x in (
#             "Positive Sentiment" if row["Positive Sentiment"] else None,
#             "Negative Sentiment" if row["Negative Sentiment"] else None,
#             "Deniers" if row["Deniers"] else None,
#             "Aggressiveness" if row["Aggressiveness"] else None,
#             "[T] Weather Extremes" if row["[T] Weather Extremes"] else None,
#             "[T] Importance of Human Intervention" if row["[T] Importance of Human Intervention"] else None,
#             "[T] Seriousness of Gas Emissions" if row["[T] Seriousness of Gas Emissions"] else None,
#             "[T] Ideological Positions on Global Warming" if row["[T] Ideological Positions on Global Warming"] else None,
#             "[T] Impact of Resource Overconsumption" if row["[T] Impact of Resource Overconsumption"] else None,
#             "[T] Global stance" if row["[T] Global stance"] else None,
#             "[T] Politics" if row["[T] Politics"] else None,
#             "[T] Significance of Pollution Awareness Events" if row["[T] Significance of Pollution Awareness Events"] else None,
#             "[T] Donald Trump versus Science" if row["[T] Donald Trump versus Science"] else None
#         ) if x != None))

#     return result


In [None]:
# def individual_tweets_to_tuples(df):
#     df = df[['sentiment', 'aggressiveness', 'stance', 'topic']]
#     df['aggressiveness'] = df['aggressiveness'].copy().str.title()
#     df['stance'] = df['stance'].copy().str.title()
#     df['topic'] = '[T] ' + df['topic']

#     return [tuple(row) for index, row in df.iterrows()]

# individual_tweets_to_tuples(filteredTweetsDF)

In [None]:
filteredTweetsDF.head(100)

In [None]:
def discretize_disasters(filteredDisastersDF, granularity="day"):
  """Build 0.0/1.0 indicators telling which disaster types are active per time bucket.

  Args:
    filteredDisastersDF: frame with the columns ``Disaster Type``,
      ``start_date`` and ``end_date`` (datetimes). Dates are assumed to sit on
      the same daily grid as the earliest ``start_date`` (typically midnight).
    granularity: "day", "week"; any other value buckets by month.

  Returns:
    For "day": one row per calendar day between the earliest start and the
    latest end, with one "[D] <type>" column per disaster type seen in the data.
    Otherwise: one row per period with exactly the columns "[D] Storm",
    "[D] Flood", "[D] Extreme temperature", "[D] Wildfire" and "[D] Drought"
    (1.0 when the type is active on any day of the period), indexed by the
    period start minus one day. A type missing from the data yields an all-zero
    column instead of a KeyError.
  """
  grouped_types = ("Storm", "Flood", "Extreme temperature", "Wildfire", "Drought")

  min_date = filteredDisastersDF['start_date'].min()
  max_date = filteredDisastersDF['end_date'].max()
  new_df = pd.DataFrame(index=pd.date_range(min_date, max_date, name='date'))

  for disaster_type, start_date, end_date in zip(filteredDisastersDF['Disaster Type'],
                                                 filteredDisastersDF['start_date'],
                                                 filteredDisastersDF['end_date']):
    if end_date < start_date:
      # An inverted range covers no day, so it contributes nothing.
      continue
    if disaster_type not in new_df.columns:
      new_df[disaster_type] = 0.0
    # Label slicing on the DatetimeIndex is inclusive on both ends, so this
    # marks every day of the disaster with one assignment.
    new_df.loc[start_date:end_date, disaster_type] = 1.0

  new_df.columns = ['[D] ' + column_name for column_name in new_df.columns]

  if granularity == "day":
    return new_df

  periods = new_df.index.to_period('W' if granularity == "week" else 'M')
  indicator_columns = ['[D] ' + disaster_type for disaster_type in grouped_types]
  # reindex keeps only the five analysed types and adds a zero column for any
  # of them that never occurred.
  daily_flags = new_df.reindex(columns=indicator_columns, fill_value=0.0)
  discreteDF = (daily_flags.groupby(periods).sum() > 0).astype(float)
  # Label each period by the day before its start.
  discreteDF.index = discreteDF.index.to_timestamp() + pd.DateOffset(days=-1)
  return discreteDF

In [None]:
# Tweet indicators per day and per week.
discreteDailyTweets = discretizeTweets(filteredTweetsDF, granularity="day")
discreteWeeklyTweets = discretizeTweets(filteredTweetsDF, granularity="week")

# Disaster indicators at the same two granularities.
discreteDailyDisasters = discretize_disasters(filteredDisastersDF, granularity="day")
discreteWeeklyDisasters = discretize_disasters(filteredDisastersDF, granularity="week")

# Outer join on the index keeps dates that appear in only one of the two frames;
# indicators missing on such dates are filled with 0.0.
discreteDailyTweetsAndDisasters = discreteDailyTweets.merge(discreteDailyDisasters, left_index=True, right_index=True, how="outer").fillna(0.0)
discreteWeeklyTweetsAndDisasters = discreteWeeklyTweets.merge(discreteWeeklyDisasters, left_index=True, right_index=True, how="outer").fillna(0.0)

# Frequent Itemsets and Association Rules

In [None]:
import os
import pickle

tweets_tuples_path = "/content/drive/My Drive/Dataset/tweets_tuples.pkl"

# Converting the tweets is slow, so the tuples can be cached on Drive.
# NOTE: a cached file is not checked against the current filters; delete it (or
# set read_tuples_from_file = False) after changing how tweets are filtered.
read_tuples_from_file = True
save_tuples_to_file = False

# SECURITY: pickle.load executes code embedded in the file. Only load a cache
# that this notebook wrote itself.
if read_tuples_from_file and os.path.exists(tweets_tuples_path):
  with open(tweets_tuples_path, 'rb') as file:
    tweets_tuples = pickle.load(file)
else:
  # No usable cache (disabled, or the file does not exist yet): build the
  # tuples instead of failing with FileNotFoundError.
  tweets_tuples = individual_tweets_to_tuples(filteredTweetsDF)
  if save_tuples_to_file:
    with open(tweets_tuples_path, 'wb') as file:
        pickle.dump(tweets_tuples, file)

In [None]:
tweets_tuples

In [None]:
# Mine frequent itemsets (support >= 1% of tweets) and association rules
# (confidence >= 35%) from the per-tweet item tuples.
itemsets, rules = apriori(tweets_tuples, min_support=0.01, min_confidence=0.35)

In [None]:
def rules_to_df(rules):
  """Tabulate association rules, most confident first.

  Each rule must expose ``lhs``, ``rhs``, ``confidence``, ``support``, ``lift``
  and ``conviction``. The ``size`` column is the total number of items on both
  sides of the rule.
  """
  column_names = ["lhs", "rhs", "size", "confidence", "support", "lift", "conviction"]
  records = []
  for rule in rules:
    item_count = len(rule.lhs) + len(rule.rhs)
    records.append((rule.lhs, rule.rhs, item_count, rule.confidence, rule.support, rule.lift, rule.conviction))

  table = pd.DataFrame(records, columns=column_names)
  return table.sort_values(by="confidence", ascending=False)

def itemsets_to_df(itemsets, num_transactions=None):
  """Tabulate frequent itemsets with their relative support, most frequent first.

  Args:
    itemsets: mapping ``size -> {itemset: occurrence count}``.
    num_transactions: number of transactions the counts refer to. When omitted,
      the length of the notebook-level ``tweets_tuples`` list is used.

  Returns:
    A DataFrame with the columns "itemset", "size" and "support", sorted by
    descending support. An empty ``itemsets`` mapping yields an empty frame.
  """
  if num_transactions is None:
    num_transactions = len(tweets_tuples)

  # Iterating over the sizes actually present handles an empty mapping and
  # gaps between sizes.
  records = [
      (itemSet, size, occurrences / num_transactions)
      for size in sorted(itemsets)
      for itemSet, occurrences in itemsets[size].items()
  ]
  df = pd.DataFrame(records, columns = ["itemset", "size", "support"])

  return df.sort_values(by="support", ascending=False)

df_rules = rules_to_df(rules)
df_itemsets = itemsets_to_df(itemsets)

In [None]:
df_itemsets.loc[df_itemsets['size'] == 3][["itemset", "support"]]

In [None]:
df_itemsets.loc[df_itemsets['size'] == 2][["itemset", "support"]]

In [None]:
def filter_itemsets(df_itemsets):
  """Drop every itemset that contains one of the majority labels.

  Rows whose ``itemset`` includes "Believer", "Not Aggressive" or
  "Neutral Sentiment" are removed; all other rows are returned unchanged.
  """
  majority_labels = ("Believer", "Not Aggressive", "Neutral Sentiment")

  def has_no_majority_label(row):
      return not any(item in majority_labels for item in row["itemset"])

  rows_to_keep = df_itemsets.apply(has_no_majority_label, axis=1)
  return df_itemsets[rows_to_keep]

df_itemsets_filtered = filter_itemsets(df_itemsets)

In [None]:
df_itemsets_filtered.loc[df_itemsets_filtered['size'] == 2][["itemset", "support"]]

In [None]:
df_itemsets_filtered.loc[df_itemsets_filtered['size'] == 3][["itemset", "support"]]

In [None]:
def split_label(label, max_length=16):
    """Wrap ``label`` onto several lines at word boundaries.

    Words are never split, so a single word longer than ``max_length`` gets a
    line of its own. No empty line is emitted when the very first word does not
    fit within ``max_length``.

    Args:
        label: text to wrap; words are separated by whitespace.
        max_length: target maximum line length.

    Returns:
        The wrapped text with lines joined by a newline.
    """
    lines = []
    current_line = ""
    for word in label.split():
        if len(current_line) + len(word) + 1 <= max_length:
            current_line += " " + word
        else:
            # Flush only a non-empty line: when the first word is too long
            # there is nothing to flush yet.
            if current_line:
                lines.append(current_line.strip())
            current_line = word
    if current_line:
        lines.append(current_line.strip())
    return "\n".join(lines)

def plot_rules(df_rules, max_size=100, min_support=0.02, min_confidence=0.4, min_lift=1):
    """Display the association rules that pass the thresholds as a directed graph.

    A rule is drawn when its size is at most ``max_size``, its support and
    confidence are at least ``min_support`` / ``min_confidence`` and its lift is
    strictly greater than ``min_lift``. Each edge goes from the rule's left-hand
    side to its right-hand side and is labelled with the confidence.
    """
    # max_size is only mentioned in the title when it differs from the default.
    size_note = f', max_size: {max_size}' if max_size != 100 else ''
    title = f"Association Rules - min_support: {min_support}, min_confidence: {min_confidence}, min_lift: {min_lift}{size_note}"

    dot = Digraph()
    dot.attr(label=title, labelloc='t', fontsize="20")

    for _, rule in df_rules.iterrows():
        rejected = (
            rule["size"] > max_size
            or rule["support"] < min_support
            or rule["confidence"] < min_confidence
            or rule["lift"] <= min_lift
        )
        if rejected:
            continue
        antecedent = ', '.join([str(item) for item in rule["lhs"]])
        consequent = ', '.join([str(item) for item in rule["rhs"]])
        src = split_label(antecedent)
        dst = split_label(consequent)
        dot.node(src)
        dot.node(dst)
        dot.edge(src, dst, color="black", label="{:.4f}".format(rule["confidence"]))

    display(dot)
    # To export the graph to a file: dot.render('digraph3', format='pdf')

plot_rules(df_rules, min_lift=1.45) #digraph1
plot_rules(df_rules, min_support=0.01, min_confidence=0.5, min_lift=1.45) #digraph2
plot_rules(df_rules, max_size=2, min_confidence=0.35, min_lift=1.25) #digraph3

# Causal Graphs

In [None]:
# def normalizeTweets(df, granularity="day"):
#     df['date'] = df['created_at']

#     if granularity == "day":
#         dfGrouped = df.groupby(df['date'].dt.date)
#     elif granularity == "week":
#         dfGrouped = df.groupby(df['date'].dt.to_period('W'))
#     else:
#         dfGrouped = df.groupby(df['date'].dt.to_period('M'))

#     stancePercentages = dfGrouped["stance"].value_counts(normalize=True)
#     deniersPercentages = stancePercentages.loc[stancePercentages.index.get_level_values('stance') == 'denier']
#     deniersPercentages = deniersPercentages.reset_index(level='stance', drop=True)

#     aggressivenessPercentages = dfGrouped["aggressiveness"].value_counts(normalize=True)
#     aggressiveTweetsPercentages = aggressivenessPercentages.loc[aggressivenessPercentages.index.get_level_values('aggressiveness') == 'aggressive']
#     aggressiveTweetsPercentages = aggressiveTweetsPercentages.reset_index(level='aggressiveness', drop=True)


#     topicsPercentages = dfGrouped["topic"].value_counts(normalize=True)

#     normalizedTopics = {}
#     for topic in topics:
#         topicPercentages = topicsPercentages.loc[topicsPercentages.index.get_level_values('topic') == topic]
#         topicPercentages = topicPercentages.reset_index(level='topic', drop=True)
#         normalizedTopics[topic] = topicPercentages

#     normalizedPositiveSentiment = dfGrouped.sentiment.mean().apply(lambda x: max(x, 0.0) ** (1/3))
#     normalizedNegativeSentiment = dfGrouped.sentiment.mean().apply(lambda x: abs(min(x, 0.0)) ** (1/3))
#     normalizedDeniers = deniersPercentages
#     normalizedAggressiveness = aggressiveTweetsPercentages

#     normalizedDF = pd.DataFrame({
#         "Positive Sentiment": normalizedPositiveSentiment,
#         "Negative Sentiment": normalizedNegativeSentiment,
#         "Deniers": normalizedDeniers,
#         "Aggressiveness": normalizedAggressiveness,
#         "[T] Weather Extremes": normalizedTopics["Weather Extremes"],
#         "[T] Importance of Human Intervention": normalizedTopics["Importance of Human Intervention"],
#         "[T] Seriousness of Gas Emissions": normalizedTopics["Seriousness of Gas Emissions"],
#         "[T] Ideological Positions on Global Warming": normalizedTopics["Ideological Positions on Global Warming"],
#         "[T] Impact of Resource Overconsumption": normalizedTopics["Impact of Resource Overconsumption"],
#         "[T] Global stance": normalizedTopics["Global stance"],
#         "[T] Politics": normalizedTopics["Politics"],
#         "[T] Significance of Pollution Awareness Events": normalizedTopics["Significance of Pollution Awareness Events"],
#         "[T] Denialist Politicians versus Science": normalizedTopics["Denialist Politicians versus Science"]
#     })

#     normalizedDF = normalizedDF.fillna(0.0)
#     #discreteDF = discreteDF.astype(float)
#     #discreteDF = discreteDF.reset_index(level='date', drop=False)
#     if granularity != "day":
#       normalizedDF.index = normalizedDF.index.to_timestamp() + pd.DateOffset(days=-1)
#     return normalizedDF

# #normalizedTweetsDF = normalizeTweets(filteredTweetsDF)
# #normalizedTweetsDF.head()

In [None]:
################
# DirectLiNGAM #
################
import lingam

def DirectLiNGAM(df):
    """Estimate lag-1 causal edges with DirectLiNGAM.

    The series is stacked with a copy of itself shifted by one step, so every
    variable appears twice: present (original name) and lagged (``<name>_1``).
    The prior-knowledge matrix marks only the lagged-to-present block as
    unknown (-1) and leaves every other entry at 0.

    Args:
        df: DataFrame with one column per variable and one row per time step.

    Returns:
        ``(vertices, edges)``: ``vertices`` is the set of present and lagged
        column names, ``edges`` the set of ``(<src>_1, dst)`` pairs with a
        non-zero entry in the fitted adjacency matrix.
    """
    base_columns = list(df.columns)
    lagged_columns = [f'{name}_1' for name in base_columns]
    n_vars = len(base_columns)

    # adding lagged vars: row t holds the values at t followed by those at t-1
    stacked = pd.DataFrame(
        np.hstack([df.iloc[1:, :], df.iloc[:-1, :]]),
        columns=base_columns + lagged_columns,
    )

    # adding prior knowledge to model
    priori = np.zeros(shape=(2 * n_vars, 2 * n_vars))
    priori[:n_vars, n_vars:] = -1
    model = lingam.DirectLiNGAM(random_state=1, prior_knowledge=priori)
    model.fit(stacked)

    # adjacency[row, column] != 0 is read as an edge column -> row.
    adjacency = np.asarray(model.adjacency_matrix_)

    # There should be no edge from the present into the past (Xi -> Xj_1),
    # because the prior knowledge rules it out.
    assert not np.any(adjacency[n_vars:, :n_vars] != 0)

    # Keep only lagged -> present edges. Variables are told apart by their
    # position in the stacked frame instead of looking for '_' in the name,
    # which dropped every edge for columns whose own name contains '_'.
    present_block = adjacency[:n_vars, n_vars:]
    arcos = {
        (lagged_columns[src_idx], base_columns[dst_idx])
        for dst_idx, src_idx in zip(*np.nonzero(present_block))
    }

    graph = (set(stacked.columns), arcos)
    return graph


#########
# PCMCI #
#########
from tigramite.pcmci import PCMCI
from tigramite.independence_tests import parcorr
import tigramite.data_processing as pp

def pcmci(df):
    """Estimate lag-1 causal edges with PCMCI and a partial-correlation test.

    Returns ``(vertices, edges)``: ``vertices`` is the set of column names and
    ``edges`` holds a ``(<src>_1, dst)`` pair for every lag-1 link whose p-value
    is below 0.05. Contemporaneous (lag 0) links are not used.
    """
    names = df.columns
    n_vars = len(names)

    frame = pp.DataFrame(df.values,var_names=names)
    runner = PCMCI(dataframe=frame, cond_ind_test=parcorr.ParCorr())
    p_values = runner.run_pcmci(tau_max=1, pc_alpha=None)['p_matrix']

    # p_values[i, j, 1] belongs to the link "variable i at lag 1 -> variable j".
    arcos = {
        (f'{names[src_idx]}_1', names[dst_idx])
        for src_idx in range(n_vars)
        for dst_idx in range(n_vars)
        if p_values[src_idx,dst_idx,1]<0.05
    }

    return (set(names), arcos)


#######
# VAR #
#######
from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import adfuller
from statsmodels.tools.eval_measures import rmse, aic

def var(df, lag=1):
    """Estimate lagged causal edges from the p-values of a fitted VAR model.

    Returns ``(vertices, edges)``: ``vertices`` contains every column name plus
    its ``_1`` counterpart, ``edges`` a ``(<regressor>_1, target)`` pair for
    every coefficient with a p-value below 0.05. The ``L1.`` prefix of the
    regressor names is removed.
    """
    pvalues = VAR(df).fit(lag).pvalues
    targets = list(pvalues.columns)
    nodes = targets + [f'{name}_1' for name in targets]

    arcos = set()
    for row, regressor in enumerate(pvalues.index):
        if row == 0:
            # The first row is skipped (presumably the intercept term - verify).
            continue
        src = f"{regressor.replace('L1.','')}_1"
        for column, dst in enumerate(targets):
            if pvalues.iloc[row,column] < 0.05:
                arcos.add((src, dst))

    return (set(nodes), arcos)


######
# PC #
######
from tigramite.pcmci import PCMCI
from tigramite.independence_tests import parcorr
import tigramite.data_processing as pp

def pc(df):
    """Estimate lag-1 causal edges with the PC-stable step of PCMCI.

    Returns ``(vertices, edges)``: ``vertices`` is the set of column names and
    ``edges`` holds a ``(<src>_1, dst)`` pair for every parent reported for
    ``dst``. Every reported parent is expected to be at lag -1.
    """
    names = df.columns
    frame = pp.DataFrame(df.values,var_names=names)
    runner = PCMCI(dataframe=frame, cond_ind_test=parcorr.ParCorr())
    parents_by_target = runner.run_pc_stable(tau_max=1, pc_alpha=None)

    arcos = set()
    for dst_idx, dst in enumerate(names):
        for src_idx, lag in parents_by_target[dst_idx]:
            # tau_max=1 was requested, so only lag-1 parents should appear.
            assert lag==-1
            arcos.add((f'{names[src_idx]}_1', dst))

    return (set(names), arcos)



############
# ENSEMBLE #
############
def ensemble(df,methods):
    """Run every method on ``df`` and keep only the edges all of them agree on.

    Each method returns ``(vertices, edges)``. The vertices of the first method
    are returned together with the intersection of all edge sets.
    """
    graphs = [method(df) for method in methods]
    vertices, edges = graphs[0]
    for _, other_edges in graphs[1:]:
        edges = other_edges.intersection(edges)
    return vertices,edges

def ensemble_k(graphs,k):
    """Keep the edges that appear in at least ``k`` of the given graphs.

    Args:
        graphs: sequence of ``(vertices, edges)`` pairs.
        k: minimum number of graphs an edge must appear in.

    Returns:
        ``(vertices, edges)`` where ``vertices`` are those of the last graph
        and ``edges`` is a list of the edges found in at least ``k`` graphs.
        An empty ``graphs`` sequence yields ``(set(), [])`` instead of an
        UnboundLocalError.
    """
    vertices = set()
    edgesOccurrences = {}
    for graphVertices, graphEdges in graphs:
        vertices = graphVertices
        for edge in graphEdges:
            edgesOccurrences[edge] = edgesOccurrences.get(edge, 0) + 1

    edges = [edge for edge, occurrences in edgesOccurrences.items() if occurrences >= k]
    return vertices,edges


#########
# GRAPH #
#########
def split_label(label, max_length=16):
    """Wrap ``label`` onto several lines at word boundaries.

    Words are never split, so a single word longer than ``max_length`` gets a
    line of its own. No empty line is emitted when the very first word does not
    fit within ``max_length``.

    Args:
        label: text to wrap; words are separated by whitespace.
        max_length: target maximum line length.

    Returns:
        The wrapped text with lines joined by a newline.
    """
    lines = []
    current_line = ""
    for word in label.split():
        if len(current_line) + len(word) + 1 <= max_length:
            current_line += " " + word
        else:
            # Flush only a non-empty line: when the first word is too long
            # there is nothing to flush yet.
            if current_line:
                lines.append(current_line.strip())
            current_line = word
    if current_line:
        lines.append(current_line.strip())
    return "\n".join(lines)

def temp2contemp(grafo):
    """Collapse a lagged graph into a contemporaneous one.

    The lag suffix ``_1`` is removed from vertex names and from edge sources,
    and edges that would become self-loops are dropped. Only a trailing ``_1``
    is removed, so names such as ``x_10`` are left intact.

    Args:
        grafo: ``(vertices, edges)`` with edges given as ``(src, dst)`` pairs.

    Returns:
        ``(new_vertices, new_edges)`` as two sets.
    """
    def strip_lag(name):
        return name[:-2] if name.endswith('_1') else name

    vertices, edges = grafo
    new_vertices = {strip_lag(vertex) for vertex in vertices}
    new_edges = {(strip_lag(src), dst) for src, dst in edges if strip_lag(src) != dst}
    return new_vertices,new_edges
def plot_graph(graph, title=""):
    """Display a directed graph and export it as a PDF named after ``title``.

    Only vertices that take part in at least one edge are drawn; the vertex set
    passed in ``graph`` is ignored. Returns the graphviz ``Digraph``.
    """
    _, edges = graph

    # Collect the endpoints of every edge.
    connected = set()
    for edge in edges:
        s, t = edge
        connected.add(s)
        connected.add(t)
    vertices = list(connected)

    dot = Digraph()
    dot.attr(label=title, labelloc='t', fontsize="20")
    for node in vertices:
        dot.node(split_label(node))
    for src,dst in edges:
        dot.edge(split_label(src),split_label(dst), color="black")

    display(dot)
    dot.render(title, format='pdf')
    return dot

In [None]:
def curateGraph(graph):
  """Remove edges that are not admissible for the analysis.

  An edge is dropped when its target is a disaster indicator or when it links
  the two sentiment indicators to each other. Vertices are returned unchanged
  and the remaining edges are returned as a list.
  """
  vertices, edges = graph

  blockedSources = []
  blockedTargets = ["[D] Storm", "[D] Flood", "[D] Extreme temperature", "[D] Wildfire", "[D] Drought"]
  blockedEdges = [("Positive Sentiment", "Negative Sentiment"), ("Negative Sentiment", "Positive Sentiment")]

  curatedEdges = []
  for edge in edges:
    source, target = edge
    if source in blockedSources:
      continue
    if target in blockedTargets:
      continue
    if edge in blockedEdges:
      continue
    curatedEdges.append(edge)
  return vertices, curatedEdges

def generateCausalGraph(ks=[4], granularity="day"):
    """Run the four causal-discovery methods and plot their k-agreement graphs.

    The merged tweet/disaster indicators (daily for "day", weekly otherwise)
    are reindexed onto a gap-free date range, passed to DirectLiNGAM, PCMCI, PC
    and VAR, and for every ``k`` in ``ks`` the edges found by at least ``k``
    methods are collapsed to a contemporaneous graph and plotted.
    """
    if granularity == "day":
      source = discreteDailyTweetsAndDisasters
      freq = 'D'
    else:
      source = discreteWeeklyTweetsAndDisasters
      freq = 'W' if granularity == "week" else 'M'

    # Dates missing from the merged frame become rows of zeros.
    full_range = pd.date_range(start=source.index.min(), end=source.index.max(), freq=freq)
    dfToPlot = source.reindex(full_range, fill_value=0.0)

    method_graphs = [
        DirectLiNGAM(dfToPlot),
        pcmci(dfToPlot),
        pc(dfToPlot),
        var(dfToPlot),
    ]

    for k in ks:
      title = f"Causal Graph - k={k}, granularity={granularity}"
      agreed = ensemble_k(method_graphs, k)
      # curateGraph(...) can be applied to the collapsed graph to plot a curated variant.
      plot_graph(temp2contemp(agreed), title)



In [None]:
for granularity in ["day", "week"]:
  generateCausalGraph(ks=[2], granularity=granularity)

# Tests

In [None]:
def translate(term):
  """Return the Spanish label for ``term``, or ``term`` itself when none is known."""
  spanish_labels = {
      "[T] Weather Extremes": "[T] Extremos Climáticos",
      "[T] Importance of Human Intervention": "[T] Importancia de la Intervención Humana",
      "[T] Seriousness of Gas Emissions": "[T] Gravedad de las Emisiones de Gas",
      "[T] Ideological Positions on Global Warming": "[T] Posiciones Ideológicas respecto al Calentamiento Global",
      "[T] Impact of Resource Overconsumption": "[T] Impacto del Sobreconsumo de Recursos",
      "[T] Global stance": "[T] Postura Global",
      "[T] Politics": "[T] Política",
      "[T] Significance of Pollution Awareness Events": "[T] Importancia de los Eventos de Concientización",
      "[T] Denialist Politicians versus Science": "[T] Políticos Negacionistas vs la Ciencia",
      "[T] Undefined / One Word Hashtags": "[T] Indefinido / Hashtags de Una Palabra",
      "[D] Drought": "[D] Sequía",
      "[D] Storm": "[D] Tormenta",
      "[D] Flood": "[D] Inundación",
      "[D] Extreme temperature": "[D] Temperatura Extrema",
      "[D] Wildfire": "[D] Incendio Forestal",
      "Positive Sentiment": "Sentimiento Positivo",
      "Negative Sentiment": "Sentimiento Negativo",
      "Neutral Sentiment": "Sentimiento Neutral",
      "Aggressiveness": "Agresividad",
      "Aggressive": "Agresivo",
      "Not Aggressive": "No Agresivo",
      "Believer": "Creyente",
      "Neutral": "Neutral",
      "Denier": "Negacionista",
      "Deniers": "Negacionistas"
  }

  if term in spanish_labels:
    return spanish_labels[term]
  return term

translate("Denier")