#!/usr/bin/env python
# coding: utf-8

# In[2]:


# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import datetime
import matplotlib.pyplot as plt
import tensorflow as tf
import torch
import re
import yfinance as yf
import holidays
import pickle
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers.experimental import preprocessing
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
# `lr` is deprecated in favour of `learning_rate`; also keep the instance
# name distinct from the class so `Adam` is not shadowed.
adam_optimizer = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
# Input data files are available in the read-only "../input/" directory

import os


# In[3]:


import torch.nn.functional as F

model.eval()  # inference only; disable dropout

def SentimentAnalyzer(doc):
    # Returns an (n, 3) array of class probabilities in FinBERT's label
    # order: positive, negative, neutral.
    pt_batch = tokenizer(doc, padding=True, truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():  # no gradients needed for inference
        outputs = model(**pt_batch)
    pt_predictions = F.softmax(outputs.logits, dim=-1)
    return pt_predictions.cpu().numpy()
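

# In[ ]:


# A quick sanity check for SentimentAnalyzer (the example headlines are
# illustrative, not from the dataset): two inputs should yield a (2, 3)
# array of class probabilities whose rows each sum to ~1.
probs = SentimentAnalyzer(["Shares surge after strong earnings report",
                           "Company misses revenue estimates"])
print(probs.shape)        # (2, 3)
print(probs.sum(axis=1))  # each row ~1.0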


# In[8]:


data1 = pd.read_csv('analyst_ratings_processed.csv', index_col=0)
data1.dropna(inplace=True)
data1.rename(columns={'stock': 'ticker'}, inplace=True)
data1['date'] = data1['date'].apply(lambda x: x.split()[0])  # keep the date, drop the time
data2 = pd.read_csv('us_equities_news_dataset.csv', index_col=0)
data2.dropna(inplace=True)
data2.reset_index(drop=True, inplace=True)
data2.rename(columns={'release_date': 'date'}, inplace=True)
data2.drop(columns=['category', 'content', 'provider', 'url', 'article_id'], inplace=True)


# In[31]:


data = pd.concat([data1, data2])
data.drop_duplicates(subset='title', keep='first', inplace=True)
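

# In[ ]:


# Optional check (added for illustration): how many rows the title-based
# deduplication removed from the concatenated news set.
print(len(data1) + len(data2) - len(data), 'duplicate titles dropped')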


# In[32]:


data


# In[41]:


tickerSymbol = "MSFT"
# Group the ticker's headlines by date: {date: [titles, ...]}. Filter once
# up front instead of re-filtering the full frame on every iteration.
tickerData = data[data['ticker'] == tickerSymbol]
tmpData = {}
for i in tqdm(tickerData['date'].unique()):
    tmpData[i] = tickerData.loc[tickerData['date'] == i, 'title'].tolist()


# In[42]:


ONE_DAY = datetime.timedelta(days=1)
HOLIDAYS_US = holidays.US()

def next_business_day(dateString):
    # Roll forward to the next weekday that is not a US holiday.
    # (weekday() >= 5 covers Saturday/Sunday; `holidays.WEEKEND` was
    # removed in newer versions of the holidays package.)
    datetimeObj = datetime.datetime.strptime(dateString, '%Y-%m-%d')
    next_day = datetimeObj + ONE_DAY
    while next_day.weekday() >= 5 or next_day in HOLIDAYS_US:
        next_day += ONE_DAY
    return next_day
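

# In[ ]:


# Quick check of next_business_day (dates picked for illustration):
# Friday 2021-07-02 should skip the weekend and the observed July 4th
# holiday (Monday 2021-07-05) and land on Tuesday 2021-07-06.
print(next_business_day('2021-07-02'))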


# In[43]:


def findPercentageBySentences(sentenceList):
    # Average the per-headline probabilities into one score per class.
    # FinBERT's label order is positive, negative, neutral.
    sentimentArr = SentimentAnalyzer(sentenceList)
    posAvg, negAvg, neuAvg = np.mean(sentimentArr, axis=0)
    return {'numArticles': len(sentenceList), 'pos': posAvg, 'neg': negAvg, 'neu': neuAvg}
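

# In[ ]:


# Illustrative aggregation call (headlines are made up): two headlines
# collapse to one averaged probability per class plus the article count.
print(findPercentageBySentences([
    "Microsoft beats earnings expectations",
    "Regulators open probe into cloud unit",
]))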


# In[44]:


dateSentimentGroups = {}
for i in tqdm(tmpData):
    scores = findPercentageBySentences(tmpData[i])
    dateSentimentGroups[i] = scores


# In[46]:


import efinance as ef

stock_code = 'MSFT'
# start date
beg = '20100723'
# end date
end = '20210708'
hist = ef.stock.get_quote_history(stock_code, beg=beg, end=end)

# Flatten the per-date sentiment scores into rows. Use a new name so the
# news DataFrame `data` is not clobbered; the per-day price lookups that
# used to happen here were unused, since prices are joined in the merge
# with `hist` below.
rows = []
for i in tqdm(dateSentimentGroups):
    scores = dateSentimentGroups[i]
    rows.append([i, scores['numArticles'], scores['neu'], scores['pos'], scores['neg']])
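

# In[ ]:


# efinance returns Chinese column headers ('日期' is the trade date renamed
# below); peek at the frame before renaming and merging. The exact header
# set is an assumption from efinance's documented output; verify locally.
print(hist.columns.tolist())
print(hist.head(3))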


# In[47]:


df = pd.DataFrame(columns=['Date', 'numArticles', 'neutral', 'positive', 'negative'], data=rows)


# In[48]:


hist = hist.rename(columns={'日期':'Date'})
hist


# In[49]:


df_MSFT= pd.merge(hist,df,on=["Date"])


# In[51]:


df_MSFT.to_csv('MSFT.csv')


# In[9]:


import math
import numpy as np
import pandas as pd
from numpy.random import seed
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

import keras
import tensorflow


# In[10]:


# Load a per-ticker file (AAPL here), presumably built by the same
# pipeline as MSFT.csv above.
df = pd.read_csv('./aapl.csv', sep=',')

