{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import pandas_profiling\n",
    "from ummalqura.hijri_date import HijriDate\n",
    "from dateutil.relativedelta import relativedelta\n",
    "from datetime import date, datetime\n",
    "from imblearn.under_sampling import RandomUnderSampler\n",
    "from xgboost import XGBClassifier\n",
    "from pprint import pprint\n",
    "import numpy as np\n",
    "from imblearn.under_sampling import RandomUnderSampler\n",
    "from sklearn.metrics import f1_score\n",
    "from sklearn.metrics import make_scorer\n",
    "from sklearn.metrics import confusion_matrix\n",
    "from sklearn.metrics import precision_recall_curve\n",
    "import matplotlib\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.metrics import f1_score\n",
    "import matplotlib.pyplot as plt\n",
    "import researchpy as rp\n",
    "from scipy import stats\n",
    "from sklearn.metrics import matthews_corrcoef, roc_curve, roc_auc_score\n",
    "from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "\n",
    "from sklearn.metrics import r2_score\n",
    "from rfpimp import *\n",
    "MIN_APT_DATE = '2019-01-01'\n",
    "date_format1 = '%d/%m/%Y'\n",
    "date_format2 = '%d/%m/%y'\n",
    "time_format = '%I:%M %p'\n",
    "INPUT_DIR = 'data/'\n",
    "RES_DIR= 'results/'\n",
    "STORED_DATE_FORMAT = '%Y-%m-%d'\n",
    "STORED_TIME_FORMAT = '%H:%M:%S'\n",
    "dateCol = 'Appointment Date'\n",
    "timeCol='Appointment Time'\n",
    "fileCol = 'Patient File ID'\n",
    "attendCol = 'Attend'\n",
    "doctorCol='Doctor ID'\n",
    "bdCol='Date of birth'\n",
    "bookCol='Booking Date and Time'\n",
    "posLabel=\"NoShow\"\n",
    "labels=[\"NoShow\",\"Show\"]\n",
    "time_format = '%I:%M %p' \n",
    "CSV_DT_FORMAT='%Y-%m-%d %H:%M:%S'\n",
    "FIGS_DIR = 'figs/'\n",
    "desired_width = 20000\n",
    "pd.set_option('display.width', desired_width)\n",
    "pd.set_option('display.max_rows', 100)\n",
    "pd.set_option('display.max_columns', 50)\n",
    "pd.options.display.float_format = '{:,.2f}'.format\n",
    "\n",
    "def f(s):\n",
    "    numeric_filter = filter(str.isdigit, s)\n",
    "    numeric_string = \"\".join(numeric_filter)\n",
    "    return numeric_string\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "def train(X_train,y_train, selection):\n",
    "    \"\"\"Fit one classifier on the training data through a single-candidate grid search.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    X_train, y_train\n",
    "        Training features and labels, passed unchanged to ``GridSearchCV.fit``.\n",
    "    selection : str\n",
    "        Case-insensitive model key: 'l' (logistic regression),\n",
    "        'g' (gradient boosting) or 'r' (random forest).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    GridSearchCV\n",
    "        The fitted search object.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If ``selection`` is not one of the supported keys. The previous\n",
    "        ``exit(0)`` ended the whole session instead of reporting the bad input.\n",
    "    \"\"\"\n",
    "    # F1 is computed for the class named by the module-level posLabel.\n",
    "    f1_scorer = make_scorer(f1_score, pos_label=posLabel)\n",
    "\n",
    "    ##XGB -- settings kept for reference; no selection key is wired to them.\n",
    "    xgb_search_grid= {\n",
    "                   'eta': [0.1],\n",
    "                   'n_estimators': [128],\n",
    "                   'min_child_weight':[6],\n",
    "                   'gamma':[0.3],\n",
    "                   'colsample_bytree':[0.8],\n",
    "                   'scale_pos_weight':[1],\n",
    "                   'max_depth': [10],\n",
    "                   'subsample': [0.8]}\n",
    "\n",
    "    ##GB\n",
    "    gb_search_grid = {'loss':['deviance'],\n",
    "                   'learning_rate': [1],\n",
    "                   'n_estimators': [500],\n",
    "                   'max_depth':  [2],\n",
    "                   'min_samples_split': [2],\n",
    "                   'min_samples_leaf': [1],\n",
    "                   'max_features': ['auto'],\n",
    "                   'subsample': [0.9]}\n",
    "\n",
    "    ##LG\n",
    "    lg_search_grid={'penalty': ['l2'],'C':[0.001], 'max_iter':[500]}\n",
    "\n",
    "    ##RF\n",
    "    rf_search_grid = {'n_estimators': [100],\n",
    "                   'max_features': ['auto'],\n",
    "                   'max_depth': [10],\n",
    "                   'min_samples_split': [70],\n",
    "                   'min_samples_leaf': [70],\n",
    "                   'bootstrap': [True]}\n",
    "\n",
    "    #######SELECTION##########\n",
    "    # key -> (message, parameter grid, estimator class); only the chosen estimator is built.\n",
    "    candidates = {\n",
    "        'l': (\"Logistic Regression is selected\", lg_search_grid, LogisticRegression),\n",
    "        'g': (\"Gradient Boosting is selected\", gb_search_grid, GradientBoostingClassifier),\n",
    "        'r': (\"Random Forest is selected\", rf_search_grid, RandomForestClassifier),\n",
    "    }\n",
    "    key = selection.lower()\n",
    "    if key not in candidates:\n",
    "        raise ValueError(\"Unknown model selection \" + repr(selection) + \"; expected 'l', 'g' or 'r'\")\n",
    "    message, selected_grid, model_class = candidates[key]\n",
    "    print(message)\n",
    "    #######SELECTION##########\n",
    "\n",
    "    grid_search = GridSearchCV(estimator=model_class(), param_grid=selected_grid, cv=2, verbose=2,\n",
    "                               n_jobs=-3, scoring=f1_scorer)\n",
    "    grid_search.fit(X_train, y_train)\n",
    "    return grid_search\n",
    "    \n",
    "def testResults(grid_search,X_test,y_test, file):\n",
    "    \"\"\"Score the best estimator of a fitted search on the test split and save its PR curve.\n",
    "\n",
    "    Prints the AUC and the highest F1 found along the precision/recall curve, writes the\n",
    "    precision, recall and threshold columns to ``RES_DIR + file + '.csv'`` and returns\n",
    "    ``(f1, auc, precision, recall, thresholds)``.\n",
    "    \"\"\"\n",
    "    model = grid_search.best_estimator_\n",
    "\n",
    "    # Transpose the probability rows: column 0 is used as NoShow, column 1 as Show.\n",
    "    columns = list(zip(*model.predict_proba(X_test)))\n",
    "    noshow_scores = columns[0]\n",
    "    show_scores = columns[1]\n",
    "\n",
    "    precision, recall, thresholds = precision_recall_curve(y_test, noshow_scores, pos_label=posLabel)\n",
    "\n",
    "    # The small constant keeps the division defined where precision + recall is 0.\n",
    "    f1 = np.max(2*recall*precision/(0.00001+(recall+precision)))\n",
    "    auc = roc_auc_score(y_test, show_scores, labels=labels)\n",
    "    # Pad thresholds with a final 1 before tabulating it next to precision and recall.\n",
    "    thresholds = np.append(thresholds, 1)\n",
    "\n",
    "    print (\"Results for:\"+file)\n",
    "    print(\"AUC=\",auc)\n",
    "    print(\"F1=\",f1)\n",
    "\n",
    "    curve = pd.DataFrame({'precision': precision, 'recall': recall, 'thresholds': thresholds})\n",
    "    curve.to_csv(RES_DIR + file+'.csv', index=None, header=True, encoding='utf-8-sig')\n",
    "\n",
    "    return f1,auc,precision,recall,thresholds\n",
    "    \n",
    "  \n",
    "\n",
    "    \n",
    "    \n",
    "    \n",
    "\n",
    "def processDuration(independents, dependent):\n",
    "    print(\"Count before=\",len(dependent) )\n",
    "    print(\"Start processing duration...\")\n",
    "    count=0\n",
    "    for i in independents.index:\n",
    "        count = count + 1\n",
    "        if count % 1000 == 0:\n",
    "            print(str(count) + \" record processed...\")\n",
    "        dur = independents['appointment Duration'][i]\n",
    "        if dur < 60:\n",
    "            independents.drop(index=i, axis=0, inplace=True)\n",
    "            dependent.drop(index=i, axis=0, inplace=True)\n",
    "    \n",
    "    print(\"Count After=\",len(dependent) )\n",
    "    if len(dependent) != independents.shape[0]:\n",
    "        print(\"Error: Dependent and Independent not matching\")\n",
    "    return independents, dependent\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "def train_test(df):\n",
    "    \"\"\"Split ``df`` into train and test parts without shuffling.\n",
    "\n",
    "    The 'Attend' column is the target and every other column is a feature.\n",
    "    Returns ``X_train, X_test, y_train, y_test`` with the last 10% of rows as the test part.\n",
    "    \"\"\"\n",
    "    target = df['Attend']\n",
    "    features = df.drop(columns=['Attend'])\n",
    "    return train_test_split(features, target, test_size=0.1, shuffle=False)\n",
    "\n",
    "def handleSameDayAppts(df):\n",
    "    \"\"\"Give same-day appointments of one patient the same history features.\n",
    "\n",
    "    The frame is sorted by appointment date, patient file and doctor. Whenever a row\n",
    "    has the same patient file and date as the row right before it, its history columns\n",
    "    are overwritten with the previous row's values. The per-doctor visit count is copied\n",
    "    only when the doctor matches as well.\n",
    "\n",
    "    Returns the sorted frame; the frame passed in is not modified.\n",
    "    \"\"\"\n",
    "    df = df.sort_values(by=[dateCol,fileCol,doctorCol], ascending =True)\n",
    "\n",
    "    # Columns copied from the previous appointment of the same patient and day.\n",
    "    shared_cols = ['NoShow10', 'NoShow10Count', 'NoShowAll', 'NoShowAllCount',\n",
    "                   'AttendCat3', 'AttendCat5', 'AttendCat7', 'AttendCat10',\n",
    "                   'DaysSinceLastApt', 'NumOfPrevVisits']\n",
    "\n",
    "    count=0\n",
    "    fixed=0\n",
    "    filePrev=\"\"\n",
    "    aptDatePrev=0\n",
    "    docPrev=\"\"\n",
    "    prevIndex=-1\n",
    "    for i in df.index:\n",
    "        count = count + 1\n",
    "        if count % 10000 == 0:\n",
    "            print(str(count) + \" record processed...\")\n",
    "\n",
    "        file1 = df.at[i, fileCol]\n",
    "        aptDate = df.at[i, dateCol]\n",
    "        doc = df.at[i, doctorCol]\n",
    "\n",
    "        if filePrev== file1 and aptDatePrev==aptDate:\n",
    "            fixed=fixed+1\n",
    "            # sanity check\n",
    "            if df.at[i, 'MultiAppointments'] !=True:\n",
    "                print(\"Multi=\"+str(df.at[i, 'MultiAppointments'])+\" Error: expecting multiple appointments for file \"+str(file1) + \" on \"+str(aptDate))\n",
    "            for col in shared_cols:\n",
    "                df.at[i, col] = df.at[prevIndex, col]\n",
    "            if doc==docPrev:\n",
    "                df.at[i, 'NumOfPrevVisitsWithThisDoctor'] = df.at[prevIndex, 'NumOfPrevVisitsWithThisDoctor']\n",
    "\n",
    "        # Fix: the doctor used to be stored in an unrelated name (doctorPrev), so docPrev\n",
    "        # stayed \"\" and the per-doctor count above was never copied.\n",
    "        docPrev = doc\n",
    "        filePrev = file1\n",
    "        aptDatePrev = aptDate\n",
    "        prevIndex=i\n",
    "\n",
    "    print(\"Fixed=\"+str(fixed))\n",
    "    return df\n",
    "\n",
    "def crosstab(df,featuresToPlot):\n",
    "    \"\"\"Print a chi-square test of every column of ``df`` against its 'Attend' column.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Table to analyse; it must contain an 'Attend' column.\n",
    "    featuresToPlot\n",
    "        Accepted for interface compatibility; not used.\n",
    "    \"\"\"\n",
    "    # Fix: the body used to read the notebook-global aptDataFrame instead of df.\n",
    "    for column in df.columns:\n",
    "        print(column)\n",
    "        table, results = rp.crosstab(df[column], df['Attend'], prop='col',\n",
    "                                     test='chi-square',\n",
    "                                     correction=False)\n",
    "        print(results)\n",
    "        \n",
    "def generateReport(df, path):\n",
    "    \"\"\"Build a pandas-profiling report for ``df``, save it to ``path`` and print the rejected variables.\"\"\"\n",
    "    report = pandas_profiling.ProfileReport(df)\n",
    "    print(\"Save to file..\")\n",
    "    report.to_file(output_file=path)\n",
    "    print(\"Save to file complete\")\n",
    "    print(list(report.get_rejected_variables()))\n",
    "    \n",
    "def readDatatypes(df):\n",
    "    \"\"\"Restore column types on a frame read back from CSV; absent columns are skipped.\n",
    "\n",
    "    Date/time columns are parsed with CSV_DT_FORMAT, label-like columns become str and\n",
    "    'MultiAppointments' becomes bool. The frame is modified in place and returned.\n",
    "    \"\"\"\n",
    "    def toText(series):\n",
    "        return series.apply(str)\n",
    "\n",
    "    def toDatetime(series):\n",
    "        return pd.to_datetime(series, format=CSV_DT_FORMAT)\n",
    "\n",
    "    def toBool(series):\n",
    "        return series.apply(bool)\n",
    "\n",
    "    # (column, converter) pairs, applied in this order.\n",
    "    conversions = [\n",
    "        (attendCol, toText),\n",
    "        (timeCol, toDatetime),\n",
    "        (bdCol, toDatetime),\n",
    "        (bookCol, toDatetime),\n",
    "        (dateCol, toDatetime),\n",
    "        (doctorCol, toText),\n",
    "        ('AttendCat3', toText),\n",
    "        ('AttendCat5', toText),\n",
    "        ('AttendCat7', toText),\n",
    "        ('AttendCat10', toText),\n",
    "        ('MultiAppointments', toBool),\n",
    "        ('Hour', toText),\n",
    "    ]\n",
    "    for column, convert in conversions:\n",
    "        if column in df.columns:\n",
    "            df[column] = convert(df[column])\n",
    "    return df\n",
    "\n",
    "def fixDatatypes(df):\n",
    "    \"\"\"Parse the time, birth, booking and appointment-date columns and make the doctor id a string.\n",
    "\n",
    "    The time column is parsed with ``time_format`` and the three date columns with\n",
    "    ``STORED_DATE_FORMAT``. The frame is modified in place and returned.\n",
    "    \"\"\"\n",
    "    patterns = ((timeCol, time_format),\n",
    "                (bdCol, STORED_DATE_FORMAT),\n",
    "                (bookCol, STORED_DATE_FORMAT),\n",
    "                (dateCol, STORED_DATE_FORMAT))\n",
    "    for column, pattern in patterns:\n",
    "        df[column] = pd.to_datetime(df[column], format=pattern)\n",
    "    df[doctorCol] = df[doctorCol].apply(str)\n",
    "    return df\n",
    "\n",
    "def convertDateTimeCols(df,dateCols, timeCols,dateformat,timeformat):\n",
    "    for i in dateCols:\n",
    "        df[i] = pd.to_datetime(df[i], format=dateformat)\n",
    "\n",
    "    for i in timeCols:\n",
    "        df[i] = pd.to_datetime(df[i], format=timeformat)\n",
    "    return df\n",
    "\n",
    "def timeWithinWork(time, ramadan):\n",
    "    \"\"\"Return False when ``time`` lies strictly between closing and opening time, else True.\n",
    "\n",
    "    Opening/closing are 9:00 AM / 12:00 AM, or 12:00 PM / 3:00 AM when ``ramadan`` is true.\n",
    "    Both bounds are parsed with ``time_format``, so they carry strptime's default date.\n",
    "    NOTE(review): ``time`` presumably comes from the same format -- confirm with the caller.\n",
    "    \"\"\"\n",
    "    if ramadan == True:\n",
    "        opening, closing = \"12:00 PM\", \"3:00 AM\"  #12, 3\n",
    "    else:\n",
    "        # Fix: the opening time was written as \"1900-01-01 09:00:00\", which does not\n",
    "        # match time_format and made every call raise ValueError.\n",
    "        opening, closing = \"9:00 AM\", \"12:00 AM\"  #9, 0\n",
    "    workStart = datetime.strptime(opening, time_format)\n",
    "    workEnd = datetime.strptime(closing, time_format)\n",
    "\n",
    "    if time > workEnd and time < workStart:\n",
    "        return False\n",
    "    return True\n",
    "\n",
    "\n",
    "\n",
    "def readWeatherData(weatherDataFrame, aptDate, aptTime, colName):\n",
    "\n",
    "    weatherData = weatherDataFrame[(weatherDataFrame['Date'] == aptDate)]\n",
    "    dateFound= True\n",
    "    if weatherData.shape[0] != 1:\n",
    "        dateFound= False\n",
    "    weatherData = weatherData[(weatherData['Hour'] == aptTime.time().hour)]\n",
    "    if weatherData.shape[0] > 1:\n",
    "        print(str(aptDate) + \":\" + str(aptTime))\n",
    "        print(\"Error:\" + str(weatherData.shape[0]))\n",
    "        print(weatherData)\n",
    "        return float('nan')\n",
    "    elif weatherData.shape[0] == 0:\n",
    "        ## Future or very old\n",
    "        print(aptDate)\n",
    "        print(aptTime.hour)\n",
    "        print('Date Found:'+ str(dateFound))\n",
    "        exit(0)\n",
    "        return float('nan')\n",
    "    else:\n",
    "        return weatherData.iloc[0][colName]\n",
    "\n",
    "\n",
    "def isAroundHoliday(date):\n",
    "    \"\"\"Return True when the Gregorian ``date`` falls in Hijri month 9, in the first\n",
    "    10 days of Hijri month 10, or in the first 20 days of Hijri month 12.\"\"\"\n",
    "    hijri = HijriDate(date.year, date.month, date.day, gr=True)\n",
    "    return (hijri.month == 9\n",
    "            or (hijri.month == 10 and hijri.day <= 10)\n",
    "            or (hijri.month == 12 and hijri.day <= 20))\n",
    "\n",
    "\n",
    "def isRamadan(date):\n",
    "    \"\"\"Return True when the Gregorian ``date`` falls in Hijri month 9.\"\"\"\n",
    "    return HijriDate(date.year, date.month, date.day, gr=True).month == 9\n",
    "\n",
    "def readBirthDate(datestr,date_format):\n",
    "    try:\n",
    "        datevalue = datetime.strptime(dob, dateformat_dob).date()\n",
    "    except ValueError:\n",
    "        arr = dob.split('/')\n",
    "        year = int(arr[len(arr) - 1])\n",
    "        datevalue = datetime.strptime('1/1/'+str(year), dateformat_dob).date()\n",
    "       \n",
    "    \n",
    "    #check Hijri\n",
    "    arr = datevalue.strftime(dateformat_dob).split('/')\n",
    "    year = int(arr[len(arr) - 1])\n",
    "    if 1500 > year > 1300:\n",
    "        yearg=round(year*0.97+622)\n",
    "        datevalue = datetime.strptime('1/1/'+str(yearg), dateformat_dob).date()\n",
    "    if datevalue.year>2022:\n",
    "        print(\"Error: Input date is \"+datestr+ \" output is \"+str(datevalue))\n",
    "        exit()\n",
    "    return datevalue\n",
    "\n",
    "def fillAttend(df):\n",
    "    total=0\n",
    "    countAttend=0\n",
    "\n",
    "    for i in apptDF.index:\n",
    "        total = total + 1\n",
    "        attend=\"0\"\n",
    "        if apptDF['SHOW OR NOSHOW'][i]=='SHOW':\n",
    "            attend=\"1\"\n",
    "            countAttend=countAttend+1\n",
    "        df.at[i, 'Attend'] = attend\n",
    "    #print('Total Records='+ str(total))\n",
    "    #print('Attend Records='+ str(countAttend) + \",\" + str((countAttend/total)*100))\n",
    "    return df\n",
    "\n",
    "def calacNoShowMarkov(df, fileCol, dateCol, attendCol):\n",
    "    \"\"\"Add a 'Markov' column to ``df``: the no-show share seen after each attendance history.\n",
    "\n",
    "    Sequence counts are collected on the under-sampled training split only; every row\n",
    "    of ``df`` is then scored in place from those counts. ``fileCol`` and ``dateCol``\n",
    "    are not used. Nothing is returned.\n",
    "    \"\"\"\n",
    "    df[attendCol] = df[attendCol].astype(str)\n",
    "    X_train, X_test, y_train, y_test = train_test(df)\n",
    "    sampler = RandomUnderSampler(sampling_strategy='majority')\n",
    "    X_train, y_train = sampler.fit_resample(X_train, y_train)\n",
    "    X_train[attendCol] = y_train\n",
    "    sample = X_train\n",
    "\n",
    "    # history[key] counts how often a quoted sequence (past outcomes followed by the\n",
    "    # outcome of the row itself) occurs in the sample.\n",
    "    history = dict()\n",
    "    for processed, idx in enumerate(sample.index, start=1):\n",
    "        if processed % 10000 == 0:\n",
    "            print(str(processed) + \" record processed...\")\n",
    "        past = sample.at[idx, 'AttendCat10'].strip()\n",
    "        outcome = sample.at[idx, attendCol]\n",
    "        if past == \"'-1'\":\n",
    "            key = \"'\" + outcome + \"'\"\n",
    "        else:\n",
    "            key = past[:-1] + outcome + \"'\"\n",
    "        history[key] = history.get(key, 0) + 1\n",
    "\n",
    "    print(\"Adding Markov probs\")\n",
    "\n",
    "    for processed, idx in enumerate(df.index, start=1):\n",
    "        if processed % 10000 == 0:\n",
    "            print(str(processed) + \" record processed...\")\n",
    "\n",
    "        past = df.at[idx, 'AttendCat10'].strip()\n",
    "        prob = 0\n",
    "        if past != \"'-1'\":\n",
    "            # Occurrences of this history followed by a no-show (0) and by a show (1).\n",
    "            noshows = history.get(past[:-1] + \"0'\", 0)\n",
    "            shows = history.get(past[:-1] + \"1'\", 0)\n",
    "            seen = noshows + shows\n",
    "            if seen != 0:\n",
    "                prob = noshows/seen\n",
    "\n",
    "        df.at[idx, 'Markov'] = prob\n",
    "    \n",
    "\n",
    "        \n",
    "        \n",
    "        \n",
    "       \n",
    "\n",
    "        \n",
    "        \n",
    "        \n",
    "            \n",
    "        \n",
    "        \n",
    "        \n",
    "    \n",
    "def calacNoShow(df, fileCol, dateCol, attendCol):\n",
    "    \"\"\"Add per-appointment no-show history features and return the sorted frame.\n",
    "\n",
    "    The attendance column is cast to str and the frame is sorted by ``dateCol``,\n",
    "    ``fileCol`` and the module-level ``doctorCol``. Rows are then visited in that order\n",
    "    and each one receives:\n",
    "\n",
    "    * 'NoShowAllCount' / 'NoShowAll' -- count and share of \"0\" outcomes among the rows\n",
    "      of the same patient file visited before this one (0 when there are none);\n",
    "    * 'NoShow10Count' / 'NoShow10' -- the same over at most the last 10 rows of that\n",
    "      file with a strictly earlier date;\n",
    "    * 'AttendCat3/5/7/10' -- the quoted outcome string built by ``attendCat`` from\n",
    "      those earlier rows;\n",
    "    * 'MultiAppointments' -- 1 when the file has more than one row on this date.\n",
    "\n",
    "    The 'AttendCat*' columns are cast to str before returning.\n",
    "    \"\"\"\n",
    "    df[attendCol] = df[attendCol].astype(str)\n",
    "    # Running totals per patient file: no-shows seen so far and rows seen so far.\n",
    "    noShowCount = dict()\n",
    "    allCount = dict()\n",
    "    df = df.sort_values(by=[dateCol,fileCol,doctorCol], ascending =True)\n",
    "    count = 0\n",
    "    for i in df.index:\n",
    "        count = count + 1\n",
    "\n",
    "        if count % 1000 == 0:\n",
    "            print(str(count) + \" record processed...\")\n",
    "      \n",
    "            \n",
    "       \n",
    "        file1 = df[fileCol][i]\n",
    "        aptDate = df[dateCol][i]\n",
    "        noShow = 0\n",
    "        allCount1 = 0\n",
    "        noShowCount1 = 0\n",
    "\n",
    "        # Read the totals accumulated from earlier rows before this row is added to them.\n",
    "        if file1 in allCount:\n",
    "            noShow = noShowCount[file1] / allCount[file1]\n",
    "            allCount1 = allCount[file1]\n",
    "            noShowCount1 = noShowCount[file1]\n",
    "        attend = df[attendCol][i]\n",
    "        allCount1 = allCount1 + 1\n",
    "        # Stored before the increment below, so the current row is not counted.\n",
    "        df.at[i, 'NoShowAllCount'] = noShowCount1\n",
    "        if attend == \"0\":\n",
    "            noShowCount1 = noShowCount1 + 1\n",
    "        allCount[file1] = allCount1\n",
    "        noShowCount[file1] = noShowCount1\n",
    "        df.at[i, 'NoShowAll'] = noShow\n",
    "\n",
    "        # %% This code calculate NoShow\n",
    "\n",
    "        # Up to 10 rows of this file dated strictly before this appointment; a full\n",
    "        # scan of the frame is done for every row.\n",
    "        rslt_df = df[(df[fileCol] == file1) & (df[dateCol] < aptDate)].tail(10)\n",
    "        #has multiple appontments same day\n",
    "        multi=0\n",
    "        multi_df=df[(df[fileCol] == file1) & (df[dateCol] == aptDate)]\n",
    "        if multi_df.shape[0]>1:\n",
    "            multi=1\n",
    "        numOfRows = rslt_df.shape[0]\n",
    "\n",
    "\n",
    "        noShow10 = 0\n",
    "        noShow10Count=0\n",
    "        if numOfRows > 0:\n",
    "            rslt_df2 = rslt_df[(rslt_df[attendCol] == '0')]\n",
    "            noShow10Count= rslt_df2.shape[0]\n",
    "            noShow10 = noShow10Count / numOfRows\n",
    "\n",
    "        df.at[i, 'NoShow10'] = noShow10\n",
    "        df.at[i, 'NoShow10Count'] = noShow10Count\n",
    "\n",
    "        df.at[i, 'AttendCat3'] = attendCat(rslt_df,3, attendCol)\n",
    "        df.at[i, 'AttendCat5'] = attendCat(rslt_df,5, attendCol)\n",
    "        df.at[i, 'AttendCat7'] = attendCat(rslt_df,7, attendCol)\n",
    "        df.at[i, 'AttendCat10'] = attendCat(rslt_df,10, attendCol)\n",
    "        df.at[i, 'MultiAppointments'] = multi\n",
    "\n",
    "\n",
    "    \n",
    "    df['AttendCat3'] = df['AttendCat3'].apply(str)\n",
    "    df['AttendCat5'] = df['AttendCat5'].apply(str)\n",
    "    df['AttendCat7'] = df['AttendCat7'].apply(str)\n",
    "    df['AttendCat10'] = df['AttendCat10'].apply(str)\n",
    "    return df\n",
    "\n",
    "def attendCat(df,num, attendCol):\n",
    "    rslt_df = df.tail(num)\n",
    "    attendCat = \"'\"\n",
    "    if rslt_df.shape[0] == 0:\n",
    "        attendCat = \"'-1'\"\n",
    "        return attendCat\n",
    "    for j in rslt_df.index:\n",
    "        attendCat = str(attendCat + str(rslt_df[attendCol][j]))\n",
    "    return attendCat+\"'\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-1-82745b794e05>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0mapptFile\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'Data3.csv'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mapptDF\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mINPUT_DIR\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0mapptFile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0;31m#1- delete unwanted rows\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mNameError\u001b[0m: name 'pd' is not defined"
     ],
     "ename": "NameError",
     "evalue": "name 'pd' is not defined",
     "output_type": "error"
    }
   ],
   "source": [
    "\n",
    "# Load the raw appointment export.\n",
    "apptFile = 'Data3.csv'\n",
    "apptDF = pd.read_csv(INPUT_DIR + apptFile)\n",
    "\n",
    "#1- delete unwanted rows\n",
    "for unwantedStatus in ('Emergency', 'Walk-In', 'Moved'):\n",
    "    apptDF.drop(apptDF[apptDF['appointment Status'] == unwantedStatus].index, inplace=True)\n",
    "# Keep only the rows that carry a patient file id.\n",
    "apptDF = apptDF[apptDF['Patient File ID'].notna()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "dfWithCancelled = fillAttend(apptDF)\n",
    "dfOutCancelled = apptDF.drop(apptDF[apptDF['appointment Status'] == 'Cancelled'].index, inplace=False)\n",
    "\n",
    "# Report the attendance share twice: with and without the cancelled appointments.\n",
    "for title, frame in ((\"Records with Cancelled:\", dfWithCancelled),\n",
    "                     (\"Records without Cancelled:\", dfOutCancelled)):\n",
    "    total = frame.shape[0]\n",
    "    countAttend = frame[frame['Attend'] == '1'].shape[0]\n",
    "    print(title)\n",
    "    print('Total Records=' + str(total))\n",
    "    print('Attend Records=' + str(countAttend) + \",\" + str((countAttend / total) * 100))\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Normalise the date/time columns row by row, compute the no-show history\n",
    "# features and store the result as Data3_NoShow_Hist.csv.\n",
    "#Calc noShow history\n",
    "# df is the same object as dfWithCancelled, so the edits below change that frame too.\n",
    "df=dfWithCancelled\n",
    "\n",
    "# df=dfOutCancelled\n",
    "print('Total Records='+ str(df.shape[0]))\n",
    "print(\"Fixing date and time columns...\")\n",
    "\n",
    "#Fix date cols Col\n",
    "count=0\n",
    "for i in df.index:\n",
    "    count = count + 1\n",
    "    if count % 10000 == 0:\n",
    "        print(str(count) + \" record processed...\")\n",
    "    \n",
    "    # Birth date: parsed by readBirthDate; missing values are left untouched.\n",
    "    if not pd.isnull(df[bdCol][i]):\n",
    "        dob=str(df[bdCol][i])\n",
    "        dateformat_dob='%d/%m/%Y'\n",
    "        datevalue=readBirthDate(dob,dateformat_dob) \n",
    "        df.at[i, bdCol] =datevalue\n",
    "        #df.at[i, 'org_dob'] = dob\n",
    "        \n",
    "\n",
    "    # Appointment date: day/month/year text replaced by a date object.\n",
    "    aptDate=str(df[dateCol][i])\n",
    "    dateformat_date='%d/%m/%Y'\n",
    "    datevalue = datetime.strptime(aptDate, dateformat_date).date()\n",
    "    df.at[i, dateCol] =datevalue\n",
    "    \n",
    "    \n",
    "        \n",
    "    # Booking timestamp: only the date part is kept (.date() drops the time of day).\n",
    "    bookDate=str(df[bookCol][i])\n",
    "    dateformat_date='%d/%m/%Y %H:%M'\n",
    "    datevalue = datetime.strptime(bookDate, dateformat_date).date()\n",
    "    df.at[i, bookCol] =datevalue\n",
    "    \n",
    "    \n",
    "    # Appointment time: hours 24..35 are rewritten as an AM time\n",
    "    # (24 becomes 12, 25..35 become 1..11).\n",
    "    if not pd.isnull(df[timeCol][i]):\n",
    "        timeStr=str(df[timeCol][i])\n",
    "        tArr=timeStr.split(':')\n",
    "        hour=int(tArr[0])\n",
    "        assert hour < 36, \"Something is wrong here with time, should be less than 36 to work with AM:\" + timeStr\n",
    "        if  hour>=24:\n",
    "            if hour<25:\n",
    "                tArr[0] = str(hour - 12)\n",
    "            else:\n",
    "                tArr[0] = str(hour-24)\n",
    "            # Rebuild as <new hour> + the text between the first two and the last two\n",
    "            # characters + \"AM\".\n",
    "            # NOTE(review): assumes a two-digit hour and a two-character suffix -- confirm.\n",
    "            timeStr= tArr[0]+timeStr[2:len(timeStr)-2]+\"AM\"\n",
    "            df.at[i, timeCol] =timeStr\n",
    "\n",
    "\n",
    "# Convert the cleaned columns to datetimes, then derive the history features.\n",
    "df=fixDatatypes(df)\n",
    "print(df.dtypes)\n",
    "print(\"Calculating NoShow...\")\n",
    "df= calacNoShow(df, fileCol, dateCol, attendCol)\n",
    "df.to_csv(INPUT_DIR + 'Data3_NoShow_Hist.csv', index=None, header=True, encoding='utf-8-sig')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "aptDataFrame=pd.read_csv(INPUT_DIR+'Data3_NoShow_Hist.csv')\n",
    "aptDataFrame=readDatatypes(aptDataFrame)\n",
    "print(aptDataFrame.dtypes)\n",
    "aptDataFrame = aptDataFrame.sort_values(by=[dateCol,fileCol,doctorCol], ascending =True)\n",
    "prevApt = dict()\n",
    "numberOfVisits = dict()\n",
    "numberOfVisitsByDoctor = dict()\n",
    "  # read weather data and optimize by removing all old data\n",
    "print(\"Reading Weather Data...\")\n",
    "weatherDataFrame = pd.read_csv(INPUT_DIR + 'WeatherData.csv')\n",
    "weatherDataFrame['Date'] = pd.to_datetime(weatherDataFrame['Date'], format=date_format1)\n",
    "weatherDataFrame = weatherDataFrame[(weatherDataFrame['Date'] >= MIN_APT_DATE)]\n",
    "print('Start Processing Records...')\n",
    "count = 0\n",
    "invalidCount = 0\n",
    "futureCount = 0\n",
    "notWorkingHoursCount = 0\n",
    "\n",
    "for i in aptDataFrame.index:\n",
    "        count = count + 1\n",
    "        if count % 10000 == 0:\n",
    "            print(str(count) + \" record created...\")\n",
    "\n",
    "        birthDate = aptDataFrame[bdCol][i]\n",
    "        file = aptDataFrame['Patient File ID'][i]\n",
    "        aptDate = aptDataFrame['Appointment Date'][i]\n",
    "        insertDate = aptDataFrame['Booking Date and Time'][i]\n",
    "        doctor = aptDataFrame[doctorCol][i]\n",
    "        timeFrom = aptDataFrame['Appointment Time'][i]\n",
    "        attended = aptDataFrame['Attend'][i]\n",
    "       \n",
    "\n",
    "\n",
    "        \n",
    "       # if aptDate >= FUTURE_DATE:\n",
    "            #futureCount = futureCount + 1\n",
    "            #aptDataFrame.drop(index=i, axis=0, inplace=True)\n",
    "            #continue\n",
    "\n",
    "        deltaI = aptDate - insertDate\n",
    "        ddays= deltaI.days\n",
    "        if deltaI.days < 0:\n",
    "            print(\"Invalid days:\"+str(aptDate)+\" and \"+str(insertDate))\n",
    "            #aptDataFrame.drop(index=i, axis=0, inplace=True)\n",
    "            ddays=-1\n",
    "            invalidCount = invalidCount + 1\n",
    "            continue\n",
    "\n",
    "        ageAtAptDate=-999\n",
    "        if not pd.isnull(birthDate):\n",
    "            ageAtAptDate = relativedelta(aptDate, birthDate).years\n",
    "\n",
    "        daysSinceLastApt = 0\n",
    "        if file in prevApt:\n",
    "            deltaP = aptDate - prevApt[file]\n",
    "            daysSinceLastApt = deltaP.days\n",
    "        prevApt[file] = aptDate\n",
    "        #################################\n",
    "        prevVisits = 0\n",
    "        if file not in numberOfVisits:\n",
    "            numberOfVisits[file] = 0\n",
    "        prevVisits = numberOfVisits[file]\n",
    "        numberOfVisits[file] = numberOfVisits[file] + \\\n",
    "                               int(\n",
    "                                   attended)  # if the patient attended this appointment increase the number of visits by 1\n",
    "        #################################\n",
    "        prevVisitsWithThisDoctor = 0\n",
    "        patientDoc = str(file) + '-' + str(doctor)\n",
    "        if patientDoc not in numberOfVisitsByDoctor:\n",
    "            numberOfVisitsByDoctor[patientDoc] = 0\n",
    "        prevVisitsWithThisDoctor = numberOfVisitsByDoctor[patientDoc]\n",
    "        numberOfVisitsByDoctor[patientDoc] = numberOfVisitsByDoctor[patientDoc] + int(attended)\n",
    "        #################################\n",
    "        ##### Holiday is Ramadhan and 1st 10 days of Shawwal and 1st 20 days of Tholhejjah\n",
    "        aroundHoliday = isAroundHoliday(aptDate)\n",
    "        ramadhan = isRamadan(aptDate)\n",
    "        #################################\n",
    "        day = aptDate.strftime('%a')\n",
    "        #################################\n",
    "\n",
    "        ##### check date with working hours\n",
    "        #if not timeWithinWork(timeFrom, ramadhan):\n",
    "            #notWorkingHoursCount = notWorkingHoursCount + 1\n",
    "            #aptDataFrame.drop(index=i, axis=0, inplace=True)\n",
    "            #continue\n",
    "        ####################################\n",
    "        weather=\"\"\n",
    "        temperature=-999\n",
    "        timeh=-999\n",
    "        if not pd.isnull(timeFrom):\n",
    "            temperature = readWeatherData(weatherDataFrame, aptDate, timeFrom, \"feels\")\n",
    "            weather = readWeatherData(weatherDataFrame, aptDate, timeFrom, \"weather_main\")\n",
    "            timeh= timeFrom.time().hour\n",
    "\n",
    "        aptDataFrame.at[i, 'AgeAtAppointmentDate'] = ageAtAptDate\n",
    "        aptDataFrame.at[i, 'DaysSinceAptInsertion'] = ddays #difference between the reservation time and booking date\n",
    "        aptDataFrame.at[i, 'DaysSinceLastApt'] = daysSinceLastApt #days difference between the appointment and last one\n",
    "        aptDataFrame.at[i, 'AptDay'] = day\n",
    "        aptDataFrame.at[i, 'isWeekend'] = (day == 'Fri' or day == 'Sat')\n",
    "        aptDataFrame.at[i, 'Month'] = aptDate.strftime(\"%B\")\n",
    "        aptDataFrame.at[i, 'NumOfPrevVisits'] = prevVisits\n",
    "        aptDataFrame.at[i, 'NumOfPrevVisitsWithThisDoctor'] = prevVisitsWithThisDoctor\n",
    "        aptDataFrame.at[i, 'IsAroundHoliday'] = aroundHoliday\n",
    "        aptDataFrame.at[i, 'isRamadan'] = ramadhan\n",
    "        aptDataFrame.at[i, 'Temperature'] = temperature\n",
    "        aptDataFrame.at[i, 'Weather'] = weather\n",
    "        aptDataFrame.at[i, 'Hour'] =timeh\n",
    "\n",
    "print(\"Number of Invalid times=\" + repr(invalidCount))\n",
    "print(\"Number of Future Apts=\" + repr(futureCount))\n",
    "print(\"Number of Not Working Hours times=\" + str(notWorkingHoursCount))\n",
    "\n",
    "aptDataFrame.to_csv(INPUT_DIR + 'Data3_Cleaned.csv', index=None, header=True, encoding='utf-8-sig')\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [],
   "source": [
    "## Yaz Correction\n",
    "aptDataFrame=pd.read_csv(INPUT_DIR+'Data3_Cleaned.csv')\n",
    "aptDataFrame=readDatatypes(aptDataFrame)\n",
    "prevDate=\"\"\n",
    "allAbs=True\n",
    "count=0\n",
    "for i in aptDataFrame.index:\n",
    "        count = count + 1\n",
    "        if count % 10000 == 0:\n",
    "            print(str(count) + \" record created...\")\n",
    "        aptDate = aptDataFrame['Appointment Date'][i]\n",
    "        attended = aptDataFrame['Attend'][i]\n",
    "        if aptDate!=prevDate:\n",
    "            if allAbs==True:\n",
    "                print(prevDate)\n",
    "            allAbs=True\n",
    "        \n",
    "        if attended==\"0\"and allAbs:\n",
    "            allAbs=True\n",
    "        else:\n",
    "            allAbs=False\n",
    "        \n",
    "        prevDate=aptDate\n",
    "                \n",
    "                \n",
    "#2019-06-04 00:00:00\n",
    "#2019-06-05 00:00:00\n",
    "#2019-06-06 00:00:00\n",
    "#2019-06-08 00:00:00\n",
    "\n",
    "\n",
    "#2019-08-10 00:00:00\n",
    "#2019-08-11 00:00:00\n",
    "#2019-08-12 00:00:00\n",
    "#2019-08-13 00:00:00\n",
    "\n",
    "#2019-11-15 00:00:00\n",
    "        \n",
    "        \n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [],
   "source": [
    "##Recompute Noshows\n",
    "aptDataFrame=pd.read_csv(INPUT_DIR+'Data3_Cleaned.csv')\n",
    "aptDataFrame=readDatatypes(aptDataFrame)\n",
    "aptDataFrame= calacNoShow(aptDataFrame, fileCol, dateCol, attendCol)\n",
    "aptDataFrame=handleSameDayAppts(aptDataFrame)\n",
    "aptDataFrame.to_csv(INPUT_DIR + 'Data3_Corrected.csv', index=None, header=True, encoding='utf-8-sig')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "aptDataFrame=pd.read_csv(INPUT_DIR+'Data3_Corrected.csv')\n",
    "aptDataFrame=readDatatypes(aptDataFrame)\n",
    "print(aptDataFrame.dtypes)\n",
    "total=aptDataFrame.shape[0]\n",
    "#drop missing DOB\n",
    "aptDataFrame=aptDataFrame[aptDataFrame[bdCol].notna()]\n",
    "aptDataFrame=aptDataFrame[aptDataFrame['DaysSinceAptInsertion'].notna()]\n",
    "#drop negative ages\n",
    "aptDataFrame=aptDataFrame.drop(aptDataFrame[aptDataFrame['AgeAtAppointmentDate']<0].index, inplace = False)\n",
    "\n",
    "# drop missing apt time\n",
    "aptDataFrame=aptDataFrame[aptDataFrame[timeCol].notna()]\n",
    "\n",
    "\n",
    "# drop 0 or below duration\n",
    "aptDataFrame=aptDataFrame.drop(aptDataFrame[aptDataFrame['appointment Duration']<=0].index, inplace = False)\n",
    "aptDataFrame=aptDataFrame[aptDataFrame['appointment Duration'].notna()]\n",
    "aptDataFrame.drop('NumOfPrevVisitsWithThisDoctor', axis=1, inplace=True)\n",
    "aptDataFrame.drop('NumOfPrevVisits', axis=1, inplace=True)\n",
    "after_del=aptDataFrame.shape[0]\n",
    "print(\"Deleted:\"+str(total-after_del))\n",
    "\n",
    "\n",
    "\n",
    "aptDataFrame.to_csv(INPUT_DIR + 'Data3_Corrected2.csv', index=None, header=True, encoding='utf-8-sig')\n",
    "\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [],
   "source": [
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "## ANALYSIS\n",
    "\n",
    "df=pd.read_csv(INPUT_DIR+'Data3_Cleaned2.csv')\n",
    "df=readDatatypes(df)\n",
    "path=FIGS_DIR+'output5.html'\n",
    "generateReport(df,path)\n",
    "\n",
    "print(\"End...\")\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "## CAT EXPS\n",
    "print(\"Start...\")\n",
    "list1=['Markov','NoShowAllCount','NoShowAll','NoShow10','NoShow10Count','AttendCat3','AttendCat5','AttendCat7','AttendCat10']\n",
    "#list1=[]\n",
    "for i in range(len(list1)+1):\n",
    "    copy = list1.copy()\n",
    "    if i== len(list1):\n",
    "        #print(\"Delete All\")\n",
    "        tag=\"DelAll\"\n",
    "    else:\n",
    "        notDel=copy.pop(i)\n",
    "        #print(\"Not Deleted:\"+notDel)\n",
    "        tag=\"NotDel\"+notDel\n",
    "\n",
    "    \n",
    "    \n",
    "    file1='Data3_Corrected2_Markov.csv'\n",
    "    #print(\"Start...\")\n",
    "    df=pd.read_csv(INPUT_DIR+file1)\n",
    "    df=readDatatypes(df)\n",
    "    df[attendCol] = df[attendCol].apply(lambda x: \"Show\" if x == \"1\" else \"NoShow\")\n",
    "    #df.to_csv(INPUT_DIR + 'xxx.csv', index=None, header=True, encoding='utf-8-sig')\n",
    "    df.drop([fileCol,'Spciality','Date of birth','Nationality','Appointment Date','Appointment Time','Booking Date and Time'], axis = 1, inplace=True)\n",
    "\n",
    "    #df.drop(['Spciality','NoShowAllCount','NoShowAll','NoShow10','NoShow10Count','AttendCat3','AttendCat5','AttendCat7','AttendCat10'], axis = 1, inplace=True)\n",
    "\n",
    "    df.drop(copy, axis = 1, inplace=True)\n",
    "\n",
    "\n",
    "    if 'SHOW OR NOSHOW' in df.columns:\n",
    "        df.drop('SHOW OR NOSHOW', axis=1, inplace=True)\n",
    "    if 'appointment Status' in df.columns:\n",
    "        df.drop('appointment Status', axis=1, inplace=True)\n",
    "    if 'New Patient Or Not' in df.columns:\n",
    "        df.drop('New Patient Or Not', axis=1, inplace=True)\n",
    "    #print(df.dtypes)\n",
    "    non_dummy_cols = ['Spciality', fileCol,attendCol,'DaysSinceAptInsertion', 'AgeAtAppointmentDate', 'DaysSinceLastApt', 'NumOfPrevVisitsWithThisDoctor', 'NumOfPrevVisits', 'NoShowAll', 'NoShow10', 'appointment Duration', 'Temperature', 'NoShowAllCount', 'NoShow10Count'] \n",
    "    dummy_cols = list(set(df.columns) - set(non_dummy_cols))\n",
    "    df = pd.get_dummies(df, columns=dummy_cols)\n",
    "    #print(\"Done\")\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "    # Split to train test.. 90% <-> 10% (not shuffled)\n",
    "\n",
    "        #df=df.drop(df[df['appointment Duration']<60].index, inplace = False)\n",
    "       # df.drop('Spciality', axis=1, inplace=True)\n",
    "\n",
    "    X_train, X_test, y_train, y_test = train_test(df)\n",
    "    undersample = RandomUnderSampler(sampling_strategy='majority')\n",
    "    X_train, y_train = undersample.fit_resample(X_train, y_train)\n",
    "    \n",
    "   \n",
    "\n",
    "    \n",
    "    \n",
    "    \n",
    "\n",
    "    model='l'\n",
    "    name=model+\"_\"+\"CAT\"+\"_\"+tag\n",
    "    file='Train_'+name\n",
    "    total=len(y_train)+len(y_test)\n",
    "    grid_search=train(X_train,y_train, model)\n",
    "    print(\"#####Training#####%=\"+str(len(y_train)/total))\n",
    "    show_train=y_train[y_train==\"Show\"]\n",
    "    noshow_train=y_train[y_train==\"NoShow\"]\n",
    "    print(\"Show%=\"+str(len(show_train)/len(y_train)))\n",
    "    print(\"NoShow%=\"+str(len(noshow_train)/len(y_train)))\n",
    "    \n",
    "    \n",
    "    \n",
    "    testResults(grid_search,X_train,y_train, file)\n",
    "    \n",
    "\n",
    "    print(\"#####Testing#####%=\"+str(len(y_test)/total))\n",
    "    show_test=y_test[y_test==\"Show\"]\n",
    "    noshow_test=y_test[y_test==\"NoShow\"]\n",
    "    print(\"Show%=\"+str(len(show_test)/len(y_test)))\n",
    "    print(\"NoShow%=\"+str(len(noshow_test)/len(y_test)))\n",
    "  \n",
    "    \n",
    "    file='Test_'+name\n",
    "    \n",
    "    testResults(grid_search,X_test,y_test, file)\n",
    "\n",
    "print(\"Done ALL\")\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "\n",
    "##DUR EXPS\n",
    "print(\"Starting...\")\n",
    "durl=[15,30,40,45,50,60]\n",
    "file1='Data3_Corrected2.csv'\n",
    "list1=['NoShowAllCount','NoShowAll','NoShow10','NoShow10Count','AttendCat3','AttendCat5','AttendCat7']\n",
    "\n",
    "#print(\"Start...\")\n",
    "df=pd.read_csv(INPUT_DIR+file1)\n",
    "df=readDatatypes(df)\n",
    "df[attendCol] = df[attendCol].apply(lambda x: \"Show\" if x == \"1\" else \"NoShow\")\n",
    "#df.to_csv(INPUT_DIR + 'xxx.csv', index=None, header=True, encoding='utf-8-sig')\n",
    "df.drop([fileCol,'Spciality','Date of birth','Nationality','Appointment Date','Appointment Time','Booking Date and Time'], axis = 1, inplace=True)\n",
    "\n",
    "\n",
    "df.drop(list1, axis = 1, inplace=True)\n",
    "\n",
    "\n",
    "if 'SHOW OR NOSHOW' in df.columns:\n",
    "    df.drop('SHOW OR NOSHOW', axis=1, inplace=True)\n",
    "if 'appointment Status' in df.columns:\n",
    "    df.drop('appointment Status', axis=1, inplace=True)\n",
    "if 'New Patient Or Not' in df.columns:\n",
    "    df.drop('New Patient Or Not', axis=1, inplace=True)\n",
    "\n",
    "non_dummy_cols = ['Spciality', fileCol,attendCol,'DaysSinceAptInsertion', 'AgeAtAppointmentDate', 'DaysSinceLastApt', 'NumOfPrevVisitsWithThisDoctor', 'NumOfPrevVisits', 'NoShowAll', 'NoShow10', 'appointment Duration', 'Temperature', 'NoShowAllCount', 'NoShow10Count'] \n",
    "dummy_cols = list(set(df.columns) - set(non_dummy_cols))\n",
    "df = pd.get_dummies(df, columns=dummy_cols)\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test(df)\n",
    "undersample = RandomUnderSampler(sampling_strategy='majority')\n",
    "X_train, y_train = undersample.fit_resample(X_train, y_train)\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "model='r'\n",
    "tag1=''\n",
    "name=model+\"_\"+\"DUR\"+\"_\"+tag1\n",
    "file='Train_'+name\n",
    "total=len(y_train)+len(y_test)\n",
    "grid_search=train(X_train,y_train, model)\n",
    "print(\"#####Training#####%=\"+str(len(y_train)/total))\n",
    "show_train=y_train[y_train==\"Show\"]\n",
    "noshow_train=y_train[y_train==\"NoShow\"]\n",
    "print(\"Show%=\"+str(len(show_train)/len(y_train)))\n",
    "print(\"NoShow%=\"+str(len(noshow_train)/len(y_train)))\n",
    "\n",
    "\n",
    "\n",
    "testResults(grid_search,X_train,y_train, file)\n",
    "\n",
    "for d in durl:\n",
    "    X_train, X_test, y_train, y_test = train_test(df)\n",
    "    count=0\n",
    "    for i in X_test.index:\n",
    "        if (X_test['appointment Duration'][i] <= d):\n",
    "            X_test.drop(index=i, axis=0, inplace=True)\n",
    "            y_test=y_test.drop(i)\n",
    "        count=count+1\n",
    "        \n",
    "    print(\"#####Testing#####%=\"+str(len(y_test)/total))\n",
    "    show_test=y_test[y_test==\"Show\"]\n",
    "    noshow_test=y_test[y_test==\"NoShow\"]\n",
    "    print(\"Show%=\"+str(len(show_test)/len(y_test)))\n",
    "    print(\"NoShow%=\"+str(len(noshow_test)/len(y_test)))\n",
    "\n",
    "\n",
    "    file='Test_'+name+\"_\"+ str(d)\n",
    "    \n",
    "\n",
    "    testResults(grid_search,X_test,y_test, file)\n",
    "\n",
    "print(\"Done ALL\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "## Test Features\n",
    "print(\"Start...\")\n",
    "\n",
    "file1='Data3_Corrected2.csv'\n",
    "#list1=['SMS','Doctor ID','MultiAppointments','AgeAtAppointmentDate','DaysSinceAptInsertion','DaysSinceLastApt','AptDay','isWeekend','Month','IsAroundHoliday','isRamadan','Temperature','Weather','Hour']\n",
    "\n",
    "list1=['isWeekend','Month','IsAroundHoliday','isRamadan','Temperature','Weather','Hour']\n",
    "for i in range(len(list1)+1):\n",
    "    df=pd.read_csv(INPUT_DIR+file1)\n",
    "    df=readDatatypes(df)\n",
    "    if i== len(list1):\n",
    "        #print(\"Delete All\")\n",
    "        tag=\"keepAll\"\n",
    "    else:\n",
    "        dele=list1[i]\n",
    "        tag=\"Del\"+dele\n",
    "        df.drop(dele, axis = 1, inplace=True)\n",
    "\n",
    "    \n",
    "    \n",
    "\n",
    "    df[attendCol] = df[attendCol].apply(lambda x: \"Show\" if x == \"1\" else \"NoShow\")\n",
    "    #df.to_csv(INPUT_DIR + 'xxx.csv', index=None, header=True, encoding='utf-8-sig')\n",
    "    df.drop([fileCol,'Spciality','Date of birth','Nationality','Appointment Date','Appointment Time','Booking Date and Time'], axis = 1, inplace=True)\n",
    "\n",
    "\n",
    "    \n",
    "\n",
    "\n",
    "    if 'SHOW OR NOSHOW' in df.columns:\n",
    "        df.drop('SHOW OR NOSHOW', axis=1, inplace=True)\n",
    "    if 'appointment Status' in df.columns:\n",
    "        df.drop('appointment Status', axis=1, inplace=True)\n",
    "    if 'New Patient Or Not' in df.columns:\n",
    "        df.drop('New Patient Or Not', axis=1, inplace=True)\n",
    "    #print(df.dtypes)\n",
    "    non_dummy_cols = ['Spciality', fileCol,attendCol,'DaysSinceAptInsertion', 'AgeAtAppointmentDate', 'DaysSinceLastApt', 'NumOfPrevVisitsWithThisDoctor', 'NumOfPrevVisits', 'NoShowAll', 'NoShow10', 'appointment Duration', 'Temperature', 'NoShowAllCount', 'NoShow10Count'] \n",
    "    dummy_cols = list(set(df.columns) - set(non_dummy_cols))\n",
    "    df = pd.get_dummies(df, columns=dummy_cols)\n",
    "  \n",
    "\n",
    "    X_train, X_test, y_train, y_test = train_test(df)\n",
    "    undersample = RandomUnderSampler(sampling_strategy='majority')\n",
    "    X_train, y_train = undersample.fit_resample(X_train, y_train)\n",
    "    \n",
    "   \n",
    "\n",
    "    \n",
    "    \n",
    "    \n",
    "\n",
    "    model='l'\n",
    "    name=model+\"_\"+\"Features\"+\"_\"+tag\n",
    "    file='Train_'+name\n",
    "    total=len(y_train)+len(y_test)\n",
    "    grid_search=train(X_train,y_train, model)\n",
    "    print(\"#####Training#####%=\"+str(len(y_train)/total))\n",
    "    show_train=y_train[y_train==\"Show\"]\n",
    "    noshow_train=y_train[y_train==\"NoShow\"]\n",
    "    print(\"Show%=\"+str(len(show_train)/len(y_train)))\n",
    "    print(\"NoShow%=\"+str(len(noshow_train)/len(y_train)))\n",
    "    \n",
    "    \n",
    "    \n",
    "    testResults(grid_search,X_train,y_train, file)\n",
    "    \n",
    "\n",
    "    print(\"#####Testing#####%=\"+str(len(y_test)/total))\n",
    "    show_test=y_test[y_test==\"Show\"]\n",
    "    noshow_test=y_test[y_test==\"NoShow\"]\n",
    "    print(\"Show%=\"+str(len(show_test)/len(y_test)))\n",
    "    print(\"NoShow%=\"+str(len(noshow_test)/len(y_test)))\n",
    "  \n",
    "    \n",
    "    file='Test_'+name\n",
    "    \n",
    "    testResults(grid_search,X_test,y_test, file)\n",
    "\n",
    "print(\"Done ALL\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "#df['clinic']=df['Spciality'].apply(f)\n",
    "#df['clinic'] = df['clinic'].replace('', np.nan)\n",
    "#df=df[df['clinic'].notna()]\n",
    "\n",
    "#print(df.shape[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "\n",
    "#for i in set(df.clinic.values):\n",
    "    #print('Processing: '+ str(i))\n",
    "    #df2 = df[df['clinic']==i]\n",
    "    #X_train, X_test, y_train, y_test = train_test(df2)\n",
    "    #undersample = RandomUnderSampler(sampling_strategy='majority')\n",
    "    #X_test, y_test = undersample.fit_resample(X_test, y_test)\n",
    "    #if len(y_test.unique())>1:\n",
    "        #testResults(grid_search,X_test,y_test)\n",
    "    #else:\n",
    "        #print(\"One Class:\",y_test.head(1))\n",
    "#print(\"DONE!\")\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "#df['clinic'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "\n",
    "##DUR EXPS\n",
    "print(\"Starting...\")\n",
    "durl=[15,30,40,45,50,60]\n",
    "file1='Data3_Corrected2.csv'\n",
    "list1=['NoShowAllCount','NoShowAll','NoShow10','NoShow10Count','AttendCat3','AttendCat5','AttendCat7']\n",
    "\n",
    "#print(\"Start...\")\n",
    "df=pd.read_csv(INPUT_DIR+file1)\n",
    "df=readDatatypes(df)\n",
    "df[attendCol] = df[attendCol].apply(lambda x: \"Show\" if x == \"1\" else \"NoShow\")\n",
    "\n",
    "\n",
    "for d in durl:\n",
    "    print(d)\n",
    "    df2 = df[df['appointment Duration'] >= d]\n",
    "    df3=df2[df2[attendCol]==\"NoShow\"]\n",
    "    print(str(df3.shape[0]/df2.shape[0]))\n",
    "    \n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "###Markov\n",
    "print(\"Start...\")\n",
    "file1='Data3_Corrected2.csv'\n",
    "list1=['NoShowAllCount','NoShowAll','NoShow10','NoShow10Count','AttendCat3','AttendCat5','AttendCat7']\n",
    "\n",
    "#print(\"Start...\")\n",
    "df=pd.read_csv(INPUT_DIR+file1)\n",
    "df=readDatatypes(df)\n",
    "calacNoShowMarkov(df, fileCol, dateCol, attendCol)\n",
    "df.to_csv(INPUT_DIR + 'Data3_Corrected2_Markov.csv', index=None, header=True, encoding='utf-8-sig')\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "#### Figs\n",
    "\n",
    "\n",
    "print(\"Start...\")\n",
    "list1=['NoShowAllCount','NoShowAll','NoShow10','NoShow10Count','AttendCat3','AttendCat5','AttendCat7']\n",
    "\n",
    "\n",
    "file1='Data3_Corrected2.csv'\n",
    "df=pd.read_csv(INPUT_DIR+file1)\n",
    "df=readDatatypes(df)\n",
    "df[attendCol] = df[attendCol].apply(lambda x: \"Show\" if x == \"1\" else \"NoShow\")\n",
    "df.drop([fileCol,'Spciality','Date of birth','Nationality','Appointment Date','Appointment Time','Booking Date and Time'], axis = 1, inplace=True)\n",
    "df.drop(list1, axis = 1, inplace=True)\n",
    "\n",
    "\n",
    "if 'SHOW OR NOSHOW' in df.columns:\n",
    "    df.drop('SHOW OR NOSHOW', axis=1, inplace=True)\n",
    "if 'appointment Status' in df.columns:\n",
    "    df.drop('appointment Status', axis=1, inplace=True)\n",
    "if 'New Patient Or Not' in df.columns:\n",
    "    df.drop('New Patient Or Not', axis=1, inplace=True)\n",
    "non_dummy_cols = ['Spciality', fileCol,attendCol,'DaysSinceAptInsertion', 'AgeAtAppointmentDate', 'DaysSinceLastApt', 'NumOfPrevVisitsWithThisDoctor', 'NumOfPrevVisits', 'NoShowAll', 'NoShow10', 'appointment Duration', 'Temperature', 'NoShowAllCount', 'NoShow10Count'] \n",
    "dummy_cols = list(set(df.columns) - set(non_dummy_cols))\n",
    "df = pd.get_dummies(df, columns=dummy_cols)\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test(df)\n",
    "undersample = RandomUnderSampler(sampling_strategy='majority')\n",
    "X_train, y_train = undersample.fit_resample(X_train, y_train)\n",
    "\n",
    "\n",
    "\n",
    "models=['Logistic Regression','Random Forests','Gradient Boosting']\n",
    "result_table = pd.DataFrame(columns=['classifiers', 'fpr','tpr','auc'])\n",
    "for model in models:\n",
    "    print(\"Doing \"+ model)\n",
    "    m=train(X_train,y_train, model[0]).best_estimator_\n",
    "    yproba = m.predict_proba(X_test)[:1]\n",
    "    fpr, tpr, _ = roc_curve(y_test,  yproba, pos_label=posLabel)\n",
    "    auc = roc_auc_score(y_test, yproba,  labels=labels)\n",
    "    result_table = result_table.append({'classifiers':model,\n",
    "                                        'fpr':fpr, \n",
    "                                        'tpr':tpr, \n",
    "                                        'auc':auc}, ignore_index=True)\n",
    "\n",
    "\n",
    "    \n",
    "    \n",
    "result_table.set_index('classifiers', inplace=True)\n",
    "\n",
    "fig = plt.figure(figsize=(8,6))\n",
    "\n",
    "for i in result_table.index:\n",
    "    plt.plot(result_table.loc[i]['fpr'], \n",
    "             result_table.loc[i]['tpr'], \n",
    "             label=\"{}, AUC={:.3f}\".format(i, result_table.loc[i]['auc']))\n",
    "    \n",
    "plt.plot([0,1], [0,1], color='orange', linestyle='--')\n",
    "\n",
    "plt.xticks(np.arange(0.0, 1.1, step=0.1))\n",
    "plt.xlabel(\"Flase Positive Rate\", fontsize=15)\n",
    "\n",
    "plt.yticks(np.arange(0.0, 1.1, step=0.1))\n",
    "plt.ylabel(\"True Positive Rate\", fontsize=15)\n",
    "\n",
    "plt.title('ROC Curve Analysis', fontweight='bold', fontsize=15)\n",
    "plt.legend(prop={'size':13}, loc='lower right')\n",
    "\n",
    "plt.show()\n",
    "\n",
    "\n",
    "    \n",
    "    \n",
    "    \n",
    "\n",
    "print(\"Done ALL\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "result_table['classifiers']=['LR','RF', 'GB']\n",
    "result_table.set_index('classifiers', inplace=True)\n",
    "lines=['--',':','-.']\n",
    "aucs=[0.717, 0.718, 0.712]\n",
    "\n",
    "fig = plt.figure(figsize=(8,6))\n",
    "count=0\n",
    "for i in result_table.index:\n",
    "    print(lines[count])\n",
    "    \n",
    "    plt.plot(result_table.loc[i]['fpr'],\n",
    "             result_table.loc[i]['tpr'], linestyle=lines[count], drawstyle='steps',\n",
    "             label=\"{}, AUC={:.3f}\".format(i, aucs[count]))\n",
    "    count=count+1\n",
    "    \n",
    "plt.plot([0,1], [0,1], color='black', linestyle='-')\n",
    "\n",
    "plt.xticks(np.arange(0.0, 1.1, step=0.1))\n",
    "plt.xlabel(\"Flase Positive Rate\", fontsize=15)\n",
    "\n",
    "plt.yticks(np.arange(0.0, 1.1, step=0.1))\n",
    "plt.ylabel(\"True Positive Rate\", fontsize=15)\n",
    "\n",
    "plt.title('ROC Curve', fontweight='bold', fontsize=15)\n",
    "plt.legend(prop={'size':13}, loc='lower right')\n",
    "plt.savefig(INPUT_DIR+\"res.svg\")\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [],
   "source": [
    "result_table['classifiers']=['LR','RF', 'GB']\n",
    "result_table.set_index('classifiers', inplace=True)\n",
    "lines=['--',':','-.']\n",
    "aucs=[0.717, 0.718, 0.712]\n",
    "\n",
    "fig = plt.figure(figsize=(8,6))\n",
    "count=0\n",
    "for i in result_table.index:\n",
    "    print(lines[count])\n",
    "    \n",
    "    plt.plot(result_table.loc[i]['fpr'],\n",
    "             result_table.loc[i]['tpr'], linestyle=lines[count], drawstyle='steps',\n",
    "             label=\"{}, AUC={:.3f}\".format(i, aucs[count]))\n",
    "    count=count+1\n",
    "    \n",
    "plt.plot([0,1], [0,1], color='black', linestyle='-')\n",
    "\n",
    "plt.xticks(np.arange(0.0, 1.1, step=0.1))\n",
    "plt.xlabel(\"Flase Positive Rate\", fontsize=15)\n",
    "\n",
    "plt.yticks(np.arange(0.0, 1.1, step=0.1))\n",
    "plt.ylabel(\"True Positive Rate\", fontsize=15)\n",
    "\n",
    "plt.title('ROC Curve', fontweight='bold', fontsize=15)\n",
    "plt.legend(prop={'size':13}, loc='lower right')\n",
    "plt.savefig(INPUT_DIR+\"res.svg\")\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "PyCharm (NoShowData3)",
   "language": "python",
   "name": "pycharm-6a655149"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  },
  "pycharm": {
   "stem_cell": {
    "cell_type": "raw",
    "source": [],
    "metadata": {
     "collapsed": false
    }
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}