{ "cells": [ { "cell_type": "markdown", "metadata": { "pycharm": { "is_executing": false, "name": "#%%\n" } }, "source": [ "#### import pandas as pd\n", "import numpy as np\n", "import pandas_profiling\n", "from ummalqura.hijri_date import HijriDate\n", "from dateutil.relativedelta import relativedelta\n", "from datetime import date, datetime\n", "from imblearn.under_sampling import RandomUnderSampler\n", "from xgboost import XGBClassifier\n", "from pprint import pprint\n", "import numpy as np\n", "from imblearn.under_sampling import RandomUnderSampler\n", "from sklearn.metrics import f1_score\n", "from sklearn.metrics import make_scorer\n", "from sklearn.metrics import confusion_matrix\n", "from sklearn.metrics import precision_recall_curve\n", "import matplotlib\n", "from sklearn.metrics import accuracy_score\n", "from sklearn.metrics import f1_score\n", "import matplotlib.pyplot as plt\n", "import researchpy as rp\n", "from scipy import stats\n", "from sklearn.metrics import matthews_corrcoef, roc_curve, roc_auc_score\n", "from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV\n", "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.model_selection import GridSearchCV\n", "from sklearn.ensemble import RandomForestRegressor\n", "\n", "from sklearn.metrics import r2_score\n", "from rfpimp import *\n", "MIN_APT_DATE = '2019-01-01'\n", "date_format1 = '%d/%m/%Y'\n", "date_format2 = '%d/%m/%y'\n", "time_format = '%I:%M %p'\n", "INPUT_DIR = 'data/'\n", "RES_DIR= 'results/'\n", "STORED_DATE_FORMAT = '%Y-%m-%d'\n", "STORED_TIME_FORMAT = '%H:%M:%S'\n", "dateCol = 'Appointment Date'\n", "timeCol='Appointment Time'\n", "fileCol = 'Patient File ID'\n", "attendCol = 'Attend'\n", "doctorCol='Doctor ID'\n", "bdCol='Date of birth'\n", "bookCol='Booking Date and Time'\n", "posLabel=\"NoShow\"\n", "labels=[\"NoShow\",\"Show\"]\n", "time_format = '%I:%M %p' \n", "CSV_DT_FORMAT='%Y-%m-%d %H:%M:%S'\n", "FIGS_DIR = 'figs/'\n", "desired_width = 20000\n", "pd.set_option('display.width', desired_width)\n", "pd.set_option('display.max_rows', 100)\n", "pd.set_option('display.max_columns', 50)\n", "pd.options.display.float_format = '{:,.2f}'.format\n", "\n", "def f(s):\n", " numeric_filter = filter(str.isdigit, s)\n", " numeric_string = \"\".join(numeric_filter)\n", " return numeric_string\n", "\n", "\n", "\n", "\n", "def train(X_train,y_train, selection):\n", " #print(\"Start Training...\")\n", " f1_scorer = make_scorer(f1_score, pos_label=posLabel)\n", " score=\"f1\"\n", "##XGB\n", " xgb_search_grid= {\n", " 'eta': [0.1],\n", " 'n_estimators': [128],\n", " 'min_child_weight':[6],\n", " 'gamma':[0.3],\n", " 'colsample_bytree':[0.8],\n", " 'scale_pos_weight':[1],\n", " 'max_depth': [10],\n", " 'subsample': [0.8]}\n", " xgbc=XGBClassifier()\n", " \n", "##GB\n", " gb_search_grid = {'loss':['deviance'],\n", " 'learning_rate': [1],\n", " 'n_estimators': [500],\n", " 'max_depth': [2],\n", " 'min_samples_split': [2],\n", " 'min_samples_leaf': [1],\n", " 'max_features': ['auto'],\n", " 'subsample': [0.9]}\n", " gbc = GradientBoostingClassifier()\n", "\n", "##LG\n", " lg_search_grid={'penalty': ['l2'],'C':[0.001], 'max_iter':[500]}\n", " lgc=LogisticRegression()\n", "##RF\n", "\n", " rf_search_grid = {'n_estimators': [100],\n", " 'max_features': ['auto'],\n", " 'max_depth': [10],\n", " 'min_samples_split': [70],\n", " 'min_samples_leaf': [70],\n", " 'bootstrap': [True]}\n", "\n", " rfc = RandomForestClassifier()\n", "#######SELECTION##########\n", " if selection.lower()=='l':\n", " print(\"Logistic Regression is selected\")\n", " selected_grid=lg_search_grid\n", " selected_model=lgc\n", " elif selection.lower()=='g':\n", " print(\"Gradient Boosting is selected\")\n", " selected_grid=gb_search_grid\n", " selected_model=gbc\n", " elif selection.lower()=='r':\n", " print(\"Random Forest is selected\")\n", " selected_grid=rf_search_grid\n", " selected_model=rfc\n", " else:\n", " exit(0)\n", " \n", "#######SELECTION##########\n", " \n", " grid_search = GridSearchCV(estimator=selected_model, param_grid=selected_grid, cv=2, verbose=2,\n", " n_jobs=-3, scoring=f1_scorer)\n", "\n", " grid_search.fit(X_train, y_train)\n", " \n", " \n", " \n", " return grid_search\n", " \n", "def testResults(grid_search,X_test,y_test, file):\n", " #print(\"Classes:\")\n", " #print(grid_search.classes_)\n", " #pprint(grid_search.best_params_)\n", " #print(grid_search.best_score_)\n", " best_grid = grid_search.best_estimator_\n", "\n", " \n", " #MM\n", " prob = best_grid.predict_proba(X_test)\n", " y_predicted_positive = list(zip(*prob))[0] #NoSow\n", " y_predicted_negative = list(zip(*prob))[1] # Show\n", "\n", "\n", " precision, recall, thresholds = precision_recall_curve(y_test, y_predicted_positive, pos_label=posLabel)\n", "\n", "\n", "\n", " f1_scores = 2*recall*precision/(0.00001+(recall+precision))\n", " #print('Best threshold: ', thresholds[np.argmax(f1_scores)])\n", " f1=np.max(f1_scores)\n", " auc=roc_auc_score(y_test,y_predicted_negative, labels=labels)\n", " thresholds = np.append(thresholds, 1)\n", " \n", " \n", " print (\"Results for:\"+file)\n", " \n", " print(\"AUC=\",auc)\n", " print(\"F1=\",f1)\n", " \n", " df=pd.DataFrame(\n", " {'precision': precision,\n", " 'recall': recall,\n", " 'thresholds': thresholds\n", " })\n", " \n", " df.to_csv(RES_DIR + file+'.csv', index=None, header=True, encoding='utf-8-sig')\n", " \n", " return f1,auc,precision,recall,thresholds\n", " \n", " \n", "\n", " \n", " \n", " \n", "\n", "def processDuration(independents, dependent):\n", " print(\"Count before=\",len(dependent) )\n", " print(\"Start processing duration...\")\n", " count=0\n", " for i in independents.index:\n", " count = count + 1\n", " if count % 1000 == 0:\n", " print(str(count) + \" record processed...\")\n", " dur = independents['appointment Duration'][i]\n", " if dur < 60:\n", " independents.drop(index=i, axis=0, inplace=True)\n", " dependent.drop(index=i, axis=0, inplace=True)\n", " \n", " print(\"Count After=\",len(dependent) )\n", " if len(dependent) != independents.shape[0]:\n", " print(\"Error: Dependent and Independent not matching\")\n", " return independents, dependent\n", "\n", "\n", "\n", "\n", "def train_test(df):\n", " # Get the Dependent and Independent Features.\n", " X = df.drop(['Attend'], axis=1)\n", " y = df['Attend']\n", "\n", " # Split into 90% train and 10% test\n", " return train_test_split(X, y, test_size=0.1, shuffle=False)\n", "\n", "def handleSameDayAppts(df):\n", " \n", " df = df.sort_values(by=[dateCol,fileCol,doctorCol], ascending =True)\n", " count=0\n", " filePrev=\"\"\n", " aptDatePrev=0\n", " docPrev=\"\"\n", " fixed=0\n", " prevIndex=-1\n", " for i in df.index:\n", " count = count + 1\n", " if count % 10000 == 0:\n", " print(str(count) + \" record processed...\")\n", " \n", " file1 = df[fileCol][i]\n", " aptDate = df[dateCol][i]\n", " doc = df[doctorCol][i]\n", "\n", " \n", " if filePrev== file1 and aptDatePrev==aptDate:\n", " fixed=fixed+1\n", " # sanity check\n", " if df['MultiAppointments'][i] !=True:\n", " print(\"Multi=\"+str(df['MultiAppointments'][i])+\" Error: expecting multiple appointments for file \"+str(file1) + \" on \"+str(aptDate))\n", " df.at[i, 'NoShow10'] = df.at[prevIndex, 'NoShow10']\n", " df.at[i, 'NoShow10Count'] = df.at[prevIndex, 'NoShow10Count']\n", " df.at[i, 'NoShowAll'] = df.at[prevIndex, 'NoShowAll']\n", " df.at[i, 'NoShowAllCount'] = df.at[prevIndex, 'NoShowAllCount']\n", " \n", " \n", " \n", " df.at[i, 'AttendCat3'] = df.at[prevIndex, 'AttendCat3']\n", " df.at[i, 'AttendCat5'] = df.at[prevIndex, 'AttendCat5']\n", " df.at[i, 'AttendCat7'] = df.at[prevIndex, 'AttendCat7']\n", " df.at[i, 'AttendCat10'] = df.at[prevIndex, 'AttendCat10']\n", " \n", " \n", " df.at[i, 'DaysSinceLastApt'] = df.at[prevIndex, 'DaysSinceLastApt']\n", " df.at[i, 'NumOfPrevVisits'] = df.at[prevIndex, 'NumOfPrevVisits']\n", " if doc==docPrev:\n", " df.at[i, 'NumOfPrevVisitsWithThisDoctor'] = df.at[prevIndex, 'NumOfPrevVisitsWithThisDoctor']\n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " doctorPrev=doc\n", " filePrev = file1\n", " aptDatePrev = aptDate\n", " prevIndex=i\n", " \n", " print(\"Fixed=\"+str(fixed))\n", " return df\n", "\n", "def crosstab(df,featuresToPlot):\n", " for i in aptDataFrame.columns:\n", " print(i)\n", " table, results = rp.crosstab(aptDataFrame[i], aptDataFrame['Attend'], prop='col',\n", " test='chi-square',\n", " correction=False)\n", " print(results)\n", " \n", "def generateReport(df, path):\n", " prof=pandas_profiling.ProfileReport(df)\n", " print(\"Save to file..\")\n", " prof.to_file(output_file=path)\n", " print(\"Save to file complete\")\n", " rejected_features = list(prof.get_rejected_variables())\n", " print(rejected_features)\n", " \n", "def readDatatypes(df):\n", " if attendCol in df.columns:\n", " df[attendCol] = df[attendCol].apply(str) \n", " if timeCol in df.columns:\n", " df[timeCol] = pd.to_datetime(df[timeCol], format=CSV_DT_FORMAT)\n", " if bdCol in df.columns:\n", " df[bdCol] = pd.to_datetime(df[bdCol], format=CSV_DT_FORMAT)\n", " if bookCol in df.columns:\n", " df[bookCol] = pd.to_datetime(df[bookCol], format=CSV_DT_FORMAT)\n", " if dateCol in df.columns:\n", " df[dateCol] = pd.to_datetime(df[dateCol], format=CSV_DT_FORMAT)\n", " if doctorCol in df.columns:\n", " df[doctorCol] = df[doctorCol].apply(str) \n", " \n", " if 'AttendCat3' in df.columns:\n", " df['AttendCat3'] = df['AttendCat3'].apply(str)\n", " if 'AttendCat5' in df.columns:\n", " df['AttendCat5'] = df['AttendCat5'].apply(str)\n", " if 'AttendCat7' in df.columns:\n", " df['AttendCat7'] = df['AttendCat7'].apply(str)\n", " if 'AttendCat10' in df.columns:\n", " df['AttendCat10'] = df['AttendCat10'].apply(str)\n", " \n", "\n", " \n", " if 'MultiAppointments' in df.columns:\n", " df['MultiAppointments'] = df['MultiAppointments'].apply(bool)\n", " if 'Hour' in df.columns:\n", " df['Hour'] = df['Hour'].apply(str)\n", "\n", " \n", "\n", " \n", " return df\n", "\n", "def fixDatatypes(df):\n", " \n", " df[timeCol] = pd.to_datetime(df[timeCol], format=time_format)\n", " df[bdCol] = pd.to_datetime(df[bdCol], format=STORED_DATE_FORMAT)\n", " df[bookCol] = pd.to_datetime(df[bookCol], format=STORED_DATE_FORMAT)\n", " df[dateCol] = pd.to_datetime(df[dateCol], format=STORED_DATE_FORMAT)\n", " df[doctorCol] = df[doctorCol].apply(str)\n", " return df\n", "\n", "def convertDateTimeCols(df,dateCols, timeCols,dateformat,timeformat):\n", " for i in dateCols:\n", " df[i] = pd.to_datetime(df[i], format=dateformat)\n", "\n", " for i in timeCols:\n", " df[i] = pd.to_datetime(df[i], format=timeformat)\n", " return df\n", "\n", "def timeWithinWork(time, ramadan):\n", " workStart=datetime.strptime(\"1900-01-01 09:00:00\", time_format) #9\n", " workEnd = datetime.strptime(\"12:00 AM\", time_format) #0\n", " if ramadan == True:\n", " workStart=datetime.strptime(\"12:00 PM\", time_format) #12\n", " workEnd = datetime.strptime(\"3:00 AM\", time_format) #3\n", "\n", " if time >workEnd and time < workStart:\n", " # print(\"Ramadan=\"+str(ramadan)+\" t=\"+str(time)+\" is out of range\")\n", " return False\n", " return True\n", "\n", "\n", "\n", "def readWeatherData(weatherDataFrame, aptDate, aptTime, colName):\n", "\n", " weatherData = weatherDataFrame[(weatherDataFrame['Date'] == aptDate)]\n", " dateFound= True\n", " if weatherData.shape[0] != 1:\n", " dateFound= False\n", " weatherData = weatherData[(weatherData['Hour'] == aptTime.time().hour)]\n", " if weatherData.shape[0] > 1:\n", " print(str(aptDate) + \":\" + str(aptTime))\n", " print(\"Error:\" + str(weatherData.shape[0]))\n", " print(weatherData)\n", " return float('nan')\n", " elif weatherData.shape[0] == 0:\n", " ## Future or very old\n", " print(aptDate)\n", " print(aptTime.hour)\n", " print('Date Found:'+ str(dateFound))\n", " exit(0)\n", " return float('nan')\n", " else:\n", " return weatherData.iloc[0][colName]\n", "\n", "\n", "def isAroundHoliday(date):\n", " um = HijriDate(date.year, date.month, date.day, gr=True)\n", " if um.month == 9:\n", " return True\n", " elif um.month == 10 and um.day <= 10:\n", " return True\n", " elif um.month == 12 and um.day <= 20:\n", " return True\n", " return False\n", "\n", "\n", "def isRamadan(date):\n", " um = HijriDate(date.year, date.month, date.day, gr=True)\n", " if um.month == 9:\n", " return True\n", " \n", " return False\n", "\n", "def readBirthDate(datestr,date_format):\n", " try:\n", " datevalue = datetime.strptime(dob, dateformat_dob).date()\n", " except ValueError:\n", " arr = dob.split('/')\n", " year = int(arr[len(arr) - 1])\n", " datevalue = datetime.strptime('1/1/'+str(year), dateformat_dob).date()\n", " \n", " \n", " #check Hijri\n", " arr = datevalue.strftime(dateformat_dob).split('/')\n", " year = int(arr[len(arr) - 1])\n", " if 1500 > year > 1300:\n", " yearg=round(year*0.97+622)\n", " datevalue = datetime.strptime('1/1/'+str(yearg), dateformat_dob).date()\n", " if datevalue.year>2022:\n", " print(\"Error: Input date is \"+datestr+ \" output is \"+str(datevalue))\n", " exit()\n", " return datevalue\n", "\n", "def fillAttend(df):\n", " total=0\n", " countAttend=0\n", "\n", " for i in apptDF.index:\n", " total = total + 1\n", " attend=\"0\"\n", " if apptDF['SHOW OR NOSHOW'][i]=='SHOW':\n", " attend=\"1\"\n", " countAttend=countAttend+1\n", " df.at[i, 'Attend'] = attend\n", " #print('Total Records='+ str(total))\n", " #print('Attend Records='+ str(countAttend) + \",\" + str((countAttend/total)*100))\n", " return df\n", "\n", "def calacNoShowMarkov(df, fileCol, dateCol, attendCol):\n", " df[attendCol] = df[attendCol].astype(str)\n", " count = 0\n", " X_train, X_test, y_train, y_test = train_test(df)\n", " undersample = RandomUnderSampler(sampling_strategy='majority')\n", " X_train, y_train = undersample.fit_resample(X_train, y_train)\n", " X_train[attendCol] = y_train\n", " df2= X_train\n", " \n", " history = dict()\n", " for i in df2.index:\n", " count = count + 1\n", " if count % 10000 == 0:\n", " print(str(count) + \" record processed...\")\n", " seq=df2.at[i, 'AttendCat10']\n", " seq=seq.strip()\n", " attend=df2.at[i, attendCol]\n", " if seq==\"'-1'\":\n", " seq=\"'\"+attend+\"'\"\n", " else:\n", " seq=seq[:-1] + attend + \"'\"\n", " \n", " \n", " if seq in history:\n", " c1=history[seq]\n", " history[seq]=c1+1\n", " else:\n", " history[seq]=1\n", " #print(\"History for:\"+ seq+\"=\",history[seq])\n", " \n", " count = 0\n", " print(\"Adding Markov probs\")\n", " \n", " for i in df.index:\n", " countn=0\n", " counts=0\n", " prob=0\n", " count = count + 1\n", " if count % 10000 == 0:\n", " print(str(count) + \" record processed...\")\n", " \n", " seq=df.at[i, 'AttendCat10']\n", " seq=seq.strip()\n", " attend=df.at[i, attendCol]\n", " \n", " \n", " if seq!=\"'-1'\":\n", " seqn=seq[:-1] + \"0'\"\n", " seqs=seq[:-1] + \"1'\"\n", " counts=0\n", " countn=0\n", " if seqn in history:\n", " #print(\"Found\")\n", " countn=history[seqn]\n", " if seqs in history:\n", " #print(\"Found\")\n", " counts=history[seqs]\n", " total= countn + counts\n", "\n", " if total!= 0:\n", " prob= countn/total\n", "\n", "\n", " df.at[i, 'Markov']=prob\n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "def calacNoShow(df, fileCol, dateCol, attendCol):\n", " df[attendCol] = df[attendCol].astype(str)\n", " noShowCount = dict()\n", " allCount = dict()\n", " df = df.sort_values(by=[dateCol,fileCol,doctorCol], ascending =True)\n", " count = 0\n", " for i in df.index:\n", " count = count + 1\n", "\n", " if count % 1000 == 0:\n", " print(str(count) + \" record processed...\")\n", " \n", " \n", " \n", " file1 = df[fileCol][i]\n", " aptDate = df[dateCol][i]\n", " noShow = 0\n", " allCount1 = 0\n", " noShowCount1 = 0\n", "\n", " if file1 in allCount:\n", " noShow = noShowCount[file1] / allCount[file1]\n", " allCount1 = allCount[file1]\n", " noShowCount1 = noShowCount[file1]\n", " attend = df[attendCol][i]\n", " allCount1 = allCount1 + 1\n", " df.at[i, 'NoShowAllCount'] = noShowCount1\n", " if attend == \"0\":\n", " noShowCount1 = noShowCount1 + 1\n", " allCount[file1] = allCount1\n", " noShowCount[file1] = noShowCount1\n", " df.at[i, 'NoShowAll'] = noShow\n", "\n", " # %% This code calculate NoShow\n", "\n", " rslt_df = df[(df[fileCol] == file1) & (df[dateCol] < aptDate)].tail(10)\n", " #has multiple appontments same day\n", " multi=0\n", " multi_df=df[(df[fileCol] == file1) & (df[dateCol] == aptDate)]\n", " if multi_df.shape[0]>1:\n", " multi=1\n", " numOfRows = rslt_df.shape[0]\n", "\n", "\n", " noShow10 = 0\n", " noShow10Count=0\n", " if numOfRows > 0:\n", " rslt_df2 = rslt_df[(rslt_df[attendCol] == '0')]\n", " noShow10Count= rslt_df2.shape[0]\n", " noShow10 = noShow10Count / numOfRows\n", "\n", " df.at[i, 'NoShow10'] = noShow10\n", " df.at[i, 'NoShow10Count'] = noShow10Count\n", "\n", " df.at[i, 'AttendCat3'] = attendCat(rslt_df,3, attendCol)\n", " df.at[i, 'AttendCat5'] = attendCat(rslt_df,5, attendCol)\n", " df.at[i, 'AttendCat7'] = attendCat(rslt_df,7, attendCol)\n", " df.at[i, 'AttendCat10'] = attendCat(rslt_df,10, attendCol)\n", " df.at[i, 'MultiAppointments'] = multi\n", "\n", "\n", " \n", " df['AttendCat3'] = df['AttendCat3'].apply(str)\n", " df['AttendCat5'] = df['AttendCat5'].apply(str)\n", " df['AttendCat7'] = df['AttendCat7'].apply(str)\n", " df['AttendCat10'] = df['AttendCat10'].apply(str)\n", " return df\n", "\n", "def attendCat(df,num, attendCol):\n", " rslt_df = df.tail(num)\n", " attendCat = \"'\"\n", " if rslt_df.shape[0] == 0:\n", " attendCat = \"'-1'\"\n", " return attendCat\n", " for j in rslt_df.index:\n", " attendCat = str(attendCat + str(rslt_df[attendCol][j]))\n", " return attendCat+\"'\"" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "pycharm": { "is_executing": false, "name": "#%%\n" } }, "outputs": [ { "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mapptFile\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'Data3.csv'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mapptDF\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mINPUT_DIR\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0mapptFile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;31m#1- delete unwanted rows\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mNameError\u001b[0m: name 'pd' is not defined" ], "ename": "NameError", "evalue": "name 'pd' is not defined", "output_type": "error" } ], "source": [ "\n", "apptFile = 'Data3.csv'\n", "\n", "apptDF=pd.read_csv(INPUT_DIR+apptFile)\n", "\n", "#1- delete unwanted rows\n", "apptDF.drop(apptDF[apptDF['appointment Status']=='Emergency'].index, inplace = True)\n", "apptDF.drop(apptDF[apptDF['appointment Status']=='Walk-In'].index, inplace = True)\n", "apptDF.drop(apptDF[apptDF['appointment Status']=='Moved'].index, inplace = True)\n", "apptDF=apptDF[apptDF['Patient File ID'].notna()]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "is_executing": false, "name": "#%%\n" } }, "outputs": [], "source": [ "dfWithCancelled=fillAttend(apptDF)\n", "total=dfWithCancelled.shape[0]\n", "countAttend=dfWithCancelled[dfWithCancelled['Attend']=='1'].shape[0]\n", "print(\"Records with Cancelled:\")\n", "print('Total Records='+ str(total))\n", "print('Attend Records='+ str(countAttend) + \",\" + str((countAttend/total)*100))\n", "\n", "dfOutCancelled=apptDF.drop(apptDF[apptDF['appointment Status']=='Cancelled'].index, inplace = False)\n", "total=dfOutCancelled.shape[0]\n", "countAttend=dfOutCancelled[dfOutCancelled['Attend']=='1'].shape[0]\n", "print(\"Records without Cancelled:\")\n", "print('Total Records='+ str(total))\n", "print('Attend Records='+ str(countAttend) + \",\" + str((countAttend/total)*100))\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "is_executing": false, "name": "#%%\n" } }, "outputs": [], "source": [ "#Calc noShow history\n", "df=dfWithCancelled\n", "\n", "# df=dfOutCancelled\n", "print('Total Records='+ str(df.shape[0]))\n", "print(\"Fixing date and time columns...\")\n", "\n", "#Fix date cols Col\n", "count=0\n", "for i in df.index:\n", " count = count + 1\n", " if count % 10000 == 0:\n", " print(str(count) + \" record processed...\")\n", " \n", " if not pd.isnull(df[bdCol][i]):\n", " dob=str(df[bdCol][i])\n", " dateformat_dob='%d/%m/%Y'\n", " datevalue=readBirthDate(dob,dateformat_dob) \n", " df.at[i, bdCol] =datevalue\n", " #df.at[i, 'org_dob'] = dob\n", " \n", "\n", " aptDate=str(df[dateCol][i])\n", " dateformat_date='%d/%m/%Y'\n", " datevalue = datetime.strptime(aptDate, dateformat_date).date()\n", " df.at[i, dateCol] =datevalue\n", " \n", " \n", " \n", " bookDate=str(df[bookCol][i])\n", " dateformat_date='%d/%m/%Y %H:%M'\n", " datevalue = datetime.strptime(bookDate, dateformat_date).date()\n", " df.at[i, bookCol] =datevalue\n", " \n", " \n", " if not pd.isnull(df[timeCol][i]):\n", " timeStr=str(df[timeCol][i])\n", " tArr=timeStr.split(':')\n", " hour=int(tArr[0])\n", " assert hour < 36, \"Something is wrong here with time, should be less than 36 to work with AM:\" + timeStr\n", " if hour>=24:\n", " if hour<25:\n", " tArr[0] = str(hour - 12)\n", " else:\n", " tArr[0] = str(hour-24)\n", " timeStr= tArr[0]+timeStr[2:len(timeStr)-2]+\"AM\"\n", " df.at[i, timeCol] =timeStr\n", "\n", "\n", "df=fixDatatypes(df)\n", "print(df.dtypes)\n", "print(\"Calculating NoShow...\")\n", "df= calacNoShow(df, fileCol, dateCol, attendCol)\n", "df.to_csv(INPUT_DIR + 'Data3_NoShow_Hist.csv', index=None, header=True, encoding='utf-8-sig')\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "is_executing": false, "name": "#%%\n" } }, "outputs": [], "source": [ "aptDataFrame=pd.read_csv(INPUT_DIR+'Data3_NoShow_Hist.csv')\n", "aptDataFrame=readDatatypes(aptDataFrame)\n", "print(aptDataFrame.dtypes)\n", "aptDataFrame = aptDataFrame.sort_values(by=[dateCol,fileCol,doctorCol], ascending =True)\n", "prevApt = dict()\n", "numberOfVisits = dict()\n", "numberOfVisitsByDoctor = dict()\n", " # read weather data and optimize by removing all old data\n", "print(\"Reading Weather Data...\")\n", "weatherDataFrame = pd.read_csv(INPUT_DIR + 'WeatherData.csv')\n", "weatherDataFrame['Date'] = pd.to_datetime(weatherDataFrame['Date'], format=date_format1)\n", "weatherDataFrame = weatherDataFrame[(weatherDataFrame['Date'] >= MIN_APT_DATE)]\n", "print('Start Processing Records...')\n", "count = 0\n", "invalidCount = 0\n", "futureCount = 0\n", "notWorkingHoursCount = 0\n", "\n", "for i in aptDataFrame.index:\n", " count = count + 1\n", " if count % 10000 == 0:\n", " print(str(count) + \" record created...\")\n", "\n", " birthDate = aptDataFrame[bdCol][i]\n", " file = aptDataFrame['Patient File ID'][i]\n", " aptDate = aptDataFrame['Appointment Date'][i]\n", " insertDate = aptDataFrame['Booking Date and Time'][i]\n", " doctor = aptDataFrame[doctorCol][i]\n", " timeFrom = aptDataFrame['Appointment Time'][i]\n", " attended = aptDataFrame['Attend'][i]\n", " \n", "\n", "\n", " \n", " # if aptDate >= FUTURE_DATE:\n", " #futureCount = futureCount + 1\n", " #aptDataFrame.drop(index=i, axis=0, inplace=True)\n", " #continue\n", "\n", " deltaI = aptDate - insertDate\n", " ddays= deltaI.days\n", " if deltaI.days < 0:\n", " print(\"Invalid days:\"+str(aptDate)+\" and \"+str(insertDate))\n", " #aptDataFrame.drop(index=i, axis=0, inplace=True)\n", " ddays=-1\n", " invalidCount = invalidCount + 1\n", " continue\n", "\n", " ageAtAptDate=-999\n", " if not pd.isnull(birthDate):\n", " ageAtAptDate = relativedelta(aptDate, birthDate).years\n", "\n", " daysSinceLastApt = 0\n", " if file in prevApt:\n", " deltaP = aptDate - prevApt[file]\n", " daysSinceLastApt = deltaP.days\n", " prevApt[file] = aptDate\n", " #################################\n", " prevVisits = 0\n", " if file not in numberOfVisits:\n", " numberOfVisits[file] = 0\n", " prevVisits = numberOfVisits[file]\n", " numberOfVisits[file] = numberOfVisits[file] + \\\n", " int(\n", " attended) # if the patient attended this appointment increase the number of visits by 1\n", " #################################\n", " prevVisitsWithThisDoctor = 0\n", " patientDoc = str(file) + '-' + str(doctor)\n", " if patientDoc not in numberOfVisitsByDoctor:\n", " numberOfVisitsByDoctor[patientDoc] = 0\n", " prevVisitsWithThisDoctor = numberOfVisitsByDoctor[patientDoc]\n", " numberOfVisitsByDoctor[patientDoc] = numberOfVisitsByDoctor[patientDoc] + int(attended)\n", " #################################\n", " ##### Holiday is Ramadhan and 1st 10 days of Shawwal and 1st 20 days of Tholhejjah\n", " aroundHoliday = isAroundHoliday(aptDate)\n", " ramadhan = isRamadan(aptDate)\n", " #################################\n", " day = aptDate.strftime('%a')\n", " #################################\n", "\n", " ##### check date with working hours\n", " #if not timeWithinWork(timeFrom, ramadhan):\n", " #notWorkingHoursCount = notWorkingHoursCount + 1\n", " #aptDataFrame.drop(index=i, axis=0, inplace=True)\n", " #continue\n", " ####################################\n", " weather=\"\"\n", " temperature=-999\n", " timeh=-999\n", " if not pd.isnull(timeFrom):\n", " temperature = readWeatherData(weatherDataFrame, aptDate, timeFrom, \"feels\")\n", " weather = readWeatherData(weatherDataFrame, aptDate, timeFrom, \"weather_main\")\n", " timeh= timeFrom.time().hour\n", "\n", " aptDataFrame.at[i, 'AgeAtAppointmentDate'] = ageAtAptDate\n", " aptDataFrame.at[i, 'DaysSinceAptInsertion'] = ddays #difference between the reservation time and booking date\n", " aptDataFrame.at[i, 'DaysSinceLastApt'] = daysSinceLastApt #days difference between the appointment and last one\n", " aptDataFrame.at[i, 'AptDay'] = day\n", " aptDataFrame.at[i, 'isWeekend'] = (day == 'Fri' or day == 'Sat')\n", " aptDataFrame.at[i, 'Month'] = aptDate.strftime(\"%B\")\n", " aptDataFrame.at[i, 'NumOfPrevVisits'] = prevVisits\n", " aptDataFrame.at[i, 'NumOfPrevVisitsWithThisDoctor'] = prevVisitsWithThisDoctor\n", " aptDataFrame.at[i, 'IsAroundHoliday'] = aroundHoliday\n", " aptDataFrame.at[i, 'isRamadan'] = ramadhan\n", " aptDataFrame.at[i, 'Temperature'] = temperature\n", " aptDataFrame.at[i, 'Weather'] = weather\n", " aptDataFrame.at[i, 'Hour'] =timeh\n", "\n", "print(\"Number of Invalid times=\" + repr(invalidCount))\n", "print(\"Number of Future Apts=\" + repr(futureCount))\n", "print(\"Number of Not Working Hours times=\" + str(notWorkingHoursCount))\n", "\n", "aptDataFrame.to_csv(INPUT_DIR + 'Data3_Cleaned.csv', index=None, header=True, encoding='utf-8-sig')\n", "\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [], "source": [ "## Yaz Correction\n", "aptDataFrame=pd.read_csv(INPUT_DIR+'Data3_Cleaned.csv')\n", "aptDataFrame=readDatatypes(aptDataFrame)\n", "prevDate=\"\"\n", "allAbs=True\n", "count=0\n", "for i in aptDataFrame.index:\n", " count = count + 1\n", " if count % 10000 == 0:\n", " print(str(count) + \" record created...\")\n", " aptDate = aptDataFrame['Appointment Date'][i]\n", " attended = aptDataFrame['Attend'][i]\n", " if aptDate!=prevDate:\n", " if allAbs==True:\n", " print(prevDate)\n", " allAbs=True\n", " \n", " if attended==\"0\"and allAbs:\n", " allAbs=True\n", " else:\n", " allAbs=False\n", " \n", " prevDate=aptDate\n", " \n", " \n", "#2019-06-04 00:00:00\n", "#2019-06-05 00:00:00\n", "#2019-06-06 00:00:00\n", "#2019-06-08 00:00:00\n", "\n", "\n", "#2019-08-10 00:00:00\n", "#2019-08-11 00:00:00\n", "#2019-08-12 00:00:00\n", "#2019-08-13 00:00:00\n", "\n", "#2019-11-15 00:00:00\n", " \n", " \n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [], "source": [ "##Recompute Noshows\n", "aptDataFrame=pd.read_csv(INPUT_DIR+'Data3_Cleaned.csv')\n", "aptDataFrame=readDatatypes(aptDataFrame)\n", "aptDataFrame= calacNoShow(aptDataFrame, fileCol, dateCol, attendCol)\n", "aptDataFrame=handleSameDayAppts(aptDataFrame)\n", "aptDataFrame.to_csv(INPUT_DIR + 'Data3_Corrected.csv', index=None, header=True, encoding='utf-8-sig')\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "is_executing": false, "name": "#%%\n" } }, "outputs": [], "source": [ "aptDataFrame=pd.read_csv(INPUT_DIR+'Data3_Corrected.csv')\n", "aptDataFrame=readDatatypes(aptDataFrame)\n", "print(aptDataFrame.dtypes)\n", "total=aptDataFrame.shape[0]\n", "#drop missing DOB\n", "aptDataFrame=aptDataFrame[aptDataFrame[bdCol].notna()]\n", "aptDataFrame=aptDataFrame[aptDataFrame['DaysSinceAptInsertion'].notna()]\n", "#drop negative ages\n", "aptDataFrame=aptDataFrame.drop(aptDataFrame[aptDataFrame['AgeAtAppointmentDate']<0].index, inplace = False)\n", "\n", "# drop missing apt time\n", "aptDataFrame=aptDataFrame[aptDataFrame[timeCol].notna()]\n", "\n", "\n", "# drop 0 or below duration\n", "aptDataFrame=aptDataFrame.drop(aptDataFrame[aptDataFrame['appointment Duration']<=0].index, inplace = False)\n", "aptDataFrame=aptDataFrame[aptDataFrame['appointment Duration'].notna()]\n", "aptDataFrame.drop('NumOfPrevVisitsWithThisDoctor', axis=1, inplace=True)\n", "aptDataFrame.drop('NumOfPrevVisits', axis=1, inplace=True)\n", "after_del=aptDataFrame.shape[0]\n", "print(\"Deleted:\"+str(total-after_del))\n", "\n", "\n", "\n", "aptDataFrame.to_csv(INPUT_DIR + 'Data3_Corrected2.csv', index=None, header=True, encoding='utf-8-sig')\n", "\n", "\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [], "source": [ "\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "is_executing": false, "name": "#%%\n" } }, "outputs": [], "source": [ "## ANALYSIS\n", "\n", "df=pd.read_csv(INPUT_DIR+'Data3_Cleaned2.csv')\n", "df=readDatatypes(df)\n", "path=FIGS_DIR+'output5.html'\n", "generateReport(df,path)\n", "\n", "print(\"End...\")\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "is_executing": false, "name": "#%%\n" } }, "outputs": [], "source": [ "## CAT EXPS\n", "print(\"Start...\")\n", "list1=['Markov','NoShowAllCount','NoShowAll','NoShow10','NoShow10Count','AttendCat3','AttendCat5','AttendCat7','AttendCat10']\n", "#list1=[]\n", "for i in range(len(list1)+1):\n", " copy = list1.copy()\n", " if i== len(list1):\n", " #print(\"Delete All\")\n", " tag=\"DelAll\"\n", " else:\n", " notDel=copy.pop(i)\n", " #print(\"Not Deleted:\"+notDel)\n", " tag=\"NotDel\"+notDel\n", "\n", " \n", " \n", " file1='Data3_Corrected2_Markov.csv'\n", " #print(\"Start...\")\n", " df=pd.read_csv(INPUT_DIR+file1)\n", " df=readDatatypes(df)\n", " df[attendCol] = df[attendCol].apply(lambda x: \"Show\" if x == \"1\" else \"NoShow\")\n", " #df.to_csv(INPUT_DIR + 'xxx.csv', index=None, header=True, encoding='utf-8-sig')\n", " df.drop([fileCol,'Spciality','Date of birth','Nationality','Appointment Date','Appointment Time','Booking Date and Time'], axis = 1, inplace=True)\n", "\n", " #df.drop(['Spciality','NoShowAllCount','NoShowAll','NoShow10','NoShow10Count','AttendCat3','AttendCat5','AttendCat7','AttendCat10'], axis = 1, inplace=True)\n", "\n", " df.drop(copy, axis = 1, inplace=True)\n", "\n", "\n", " if 'SHOW OR NOSHOW' in df.columns:\n", " df.drop('SHOW OR NOSHOW', axis=1, inplace=True)\n", " if 'appointment Status' in df.columns:\n", " df.drop('appointment Status', axis=1, inplace=True)\n", " if 'New Patient Or Not' in df.columns:\n", " df.drop('New Patient Or Not', axis=1, inplace=True)\n", " #print(df.dtypes)\n", " non_dummy_cols = ['Spciality', fileCol,attendCol,'DaysSinceAptInsertion', 'AgeAtAppointmentDate', 'DaysSinceLastApt', 'NumOfPrevVisitsWithThisDoctor', 'NumOfPrevVisits', 'NoShowAll', 'NoShow10', 'appointment Duration', 'Temperature', 'NoShowAllCount', 'NoShow10Count'] \n", " dummy_cols = list(set(df.columns) - set(non_dummy_cols))\n", " df = pd.get_dummies(df, columns=dummy_cols)\n", " #print(\"Done\")\n", "\n", "\n", "\n", "\n", " # Split to train test.. 90% <-> 10% (not shuffled)\n", "\n", " #df=df.drop(df[df['appointment Duration']<60].index, inplace = False)\n", " # df.drop('Spciality', axis=1, inplace=True)\n", "\n", " X_train, X_test, y_train, y_test = train_test(df)\n", " undersample = RandomUnderSampler(sampling_strategy='majority')\n", " X_train, y_train = undersample.fit_resample(X_train, y_train)\n", " \n", " \n", "\n", " \n", " \n", " \n", "\n", " model='l'\n", " name=model+\"_\"+\"CAT\"+\"_\"+tag\n", " file='Train_'+name\n", " total=len(y_train)+len(y_test)\n", " grid_search=train(X_train,y_train, model)\n", " print(\"#####Training#####%=\"+str(len(y_train)/total))\n", " show_train=y_train[y_train==\"Show\"]\n", " noshow_train=y_train[y_train==\"NoShow\"]\n", " print(\"Show%=\"+str(len(show_train)/len(y_train)))\n", " print(\"NoShow%=\"+str(len(noshow_train)/len(y_train)))\n", " \n", " \n", " \n", " testResults(grid_search,X_train,y_train, file)\n", " \n", "\n", " print(\"#####Testing#####%=\"+str(len(y_test)/total))\n", " show_test=y_test[y_test==\"Show\"]\n", " noshow_test=y_test[y_test==\"NoShow\"]\n", " print(\"Show%=\"+str(len(show_test)/len(y_test)))\n", " print(\"NoShow%=\"+str(len(noshow_test)/len(y_test)))\n", " \n", " \n", " file='Test_'+name\n", " \n", " testResults(grid_search,X_test,y_test, file)\n", "\n", "print(\"Done ALL\")\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "is_executing": false, "name": "#%%\n" } }, "outputs": [], "source": [ "\n", "##DUR EXPS\n", "print(\"Starting...\")\n", "durl=[15,30,40,45,50,60]\n", "file1='Data3_Corrected2.csv'\n", "list1=['NoShowAllCount','NoShowAll','NoShow10','NoShow10Count','AttendCat3','AttendCat5','AttendCat7']\n", "\n", "#print(\"Start...\")\n", "df=pd.read_csv(INPUT_DIR+file1)\n", "df=readDatatypes(df)\n", "df[attendCol] = df[attendCol].apply(lambda x: \"Show\" if x == \"1\" else \"NoShow\")\n", "#df.to_csv(INPUT_DIR + 'xxx.csv', index=None, header=True, encoding='utf-8-sig')\n", "df.drop([fileCol,'Spciality','Date of birth','Nationality','Appointment Date','Appointment Time','Booking Date and Time'], axis = 1, inplace=True)\n", "\n", "\n", "df.drop(list1, axis = 1, inplace=True)\n", "\n", "\n", "if 'SHOW OR NOSHOW' in df.columns:\n", " df.drop('SHOW OR NOSHOW', axis=1, inplace=True)\n", "if 'appointment Status' in df.columns:\n", " df.drop('appointment Status', axis=1, inplace=True)\n", "if 'New Patient Or Not' in df.columns:\n", " df.drop('New Patient Or Not', axis=1, inplace=True)\n", "\n", "non_dummy_cols = ['Spciality', fileCol,attendCol,'DaysSinceAptInsertion', 'AgeAtAppointmentDate', 'DaysSinceLastApt', 'NumOfPrevVisitsWithThisDoctor', 'NumOfPrevVisits', 'NoShowAll', 'NoShow10', 'appointment Duration', 'Temperature', 'NoShowAllCount', 'NoShow10Count'] \n", "dummy_cols = list(set(df.columns) - set(non_dummy_cols))\n", "df = pd.get_dummies(df, columns=dummy_cols)\n", "\n", "\n", "\n", "\n", "\n", "\n", "X_train, X_test, y_train, y_test = train_test(df)\n", "undersample = RandomUnderSampler(sampling_strategy='majority')\n", "X_train, y_train = undersample.fit_resample(X_train, y_train)\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "model='r'\n", "tag1=''\n", "name=model+\"_\"+\"DUR\"+\"_\"+tag1\n", "file='Train_'+name\n", "total=len(y_train)+len(y_test)\n", "grid_search=train(X_train,y_train, model)\n", "print(\"#####Training#####%=\"+str(len(y_train)/total))\n", "show_train=y_train[y_train==\"Show\"]\n", "noshow_train=y_train[y_train==\"NoShow\"]\n", "print(\"Show%=\"+str(len(show_train)/len(y_train)))\n", "print(\"NoShow%=\"+str(len(noshow_train)/len(y_train)))\n", "\n", "\n", "\n", "testResults(grid_search,X_train,y_train, file)\n", "\n", "for d in durl:\n", " X_train, X_test, y_train, y_test = train_test(df)\n", " count=0\n", " for i in X_test.index:\n", " if (X_test['appointment Duration'][i] <= d):\n", " X_test.drop(index=i, axis=0, inplace=True)\n", " y_test=y_test.drop(i)\n", " count=count+1\n", " \n", " print(\"#####Testing#####%=\"+str(len(y_test)/total))\n", " show_test=y_test[y_test==\"Show\"]\n", " noshow_test=y_test[y_test==\"NoShow\"]\n", " print(\"Show%=\"+str(len(show_test)/len(y_test)))\n", " print(\"NoShow%=\"+str(len(noshow_test)/len(y_test)))\n", "\n", "\n", " file='Test_'+name+\"_\"+ str(d)\n", " \n", "\n", " testResults(grid_search,X_test,y_test, file)\n", "\n", "print(\"Done ALL\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "is_executing": false, "name": "#%%\n" } }, "outputs": [], "source": [ "## Test Features\n", "print(\"Start...\")\n", "\n", "file1='Data3_Corrected2.csv'\n", "#list1=['SMS','Doctor ID','MultiAppointments','AgeAtAppointmentDate','DaysSinceAptInsertion','DaysSinceLastApt','AptDay','isWeekend','Month','IsAroundHoliday','isRamadan','Temperature','Weather','Hour']\n", "\n", "list1=['isWeekend','Month','IsAroundHoliday','isRamadan','Temperature','Weather','Hour']\n", "for i in range(len(list1)+1):\n", " df=pd.read_csv(INPUT_DIR+file1)\n", " df=readDatatypes(df)\n", " if i== len(list1):\n", " #print(\"Delete All\")\n", " tag=\"keepAll\"\n", " else:\n", " dele=list1[i]\n", " tag=\"Del\"+dele\n", " df.drop(dele, axis = 1, inplace=True)\n", "\n", " \n", " \n", "\n", " df[attendCol] = df[attendCol].apply(lambda x: \"Show\" if x == \"1\" else \"NoShow\")\n", " #df.to_csv(INPUT_DIR + 'xxx.csv', index=None, header=True, encoding='utf-8-sig')\n", " df.drop([fileCol,'Spciality','Date of birth','Nationality','Appointment Date','Appointment Time','Booking Date and Time'], axis = 1, inplace=True)\n", "\n", "\n", " \n", "\n", "\n", " if 'SHOW OR NOSHOW' in df.columns:\n", " df.drop('SHOW OR NOSHOW', axis=1, inplace=True)\n", " if 'appointment Status' in df.columns:\n", " df.drop('appointment Status', axis=1, inplace=True)\n", " if 'New Patient Or Not' in df.columns:\n", " df.drop('New Patient Or Not', axis=1, inplace=True)\n", " #print(df.dtypes)\n", " non_dummy_cols = ['Spciality', fileCol,attendCol,'DaysSinceAptInsertion', 'AgeAtAppointmentDate', 'DaysSinceLastApt', 'NumOfPrevVisitsWithThisDoctor', 'NumOfPrevVisits', 'NoShowAll', 'NoShow10', 'appointment Duration', 'Temperature', 'NoShowAllCount', 'NoShow10Count'] \n", " dummy_cols = list(set(df.columns) - set(non_dummy_cols))\n", " df = pd.get_dummies(df, columns=dummy_cols)\n", " \n", "\n", " X_train, X_test, y_train, y_test = train_test(df)\n", " undersample = RandomUnderSampler(sampling_strategy='majority')\n", " X_train, y_train = undersample.fit_resample(X_train, y_train)\n", " \n", " \n", "\n", " \n", " \n", " \n", "\n", " model='l'\n", " name=model+\"_\"+\"Features\"+\"_\"+tag\n", " file='Train_'+name\n", " total=len(y_train)+len(y_test)\n", " grid_search=train(X_train,y_train, model)\n", " print(\"#####Training#####%=\"+str(len(y_train)/total))\n", " show_train=y_train[y_train==\"Show\"]\n", " noshow_train=y_train[y_train==\"NoShow\"]\n", " print(\"Show%=\"+str(len(show_train)/len(y_train)))\n", " print(\"NoShow%=\"+str(len(noshow_train)/len(y_train)))\n", " \n", " \n", " \n", " testResults(grid_search,X_train,y_train, file)\n", " \n", "\n", " print(\"#####Testing#####%=\"+str(len(y_test)/total))\n", " show_test=y_test[y_test==\"Show\"]\n", " noshow_test=y_test[y_test==\"NoShow\"]\n", " print(\"Show%=\"+str(len(show_test)/len(y_test)))\n", " print(\"NoShow%=\"+str(len(noshow_test)/len(y_test)))\n", " \n", " \n", " file='Test_'+name\n", " \n", " testResults(grid_search,X_test,y_test, file)\n", "\n", "print(\"Done ALL\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "is_executing": false, "name": "#%%\n" } }, "outputs": [], "source": [ "#df['clinic']=df['Spciality'].apply(f)\n", "#df['clinic'] = df['clinic'].replace('', np.nan)\n", "#df=df[df['clinic'].notna()]\n", "\n", "#print(df.shape[0])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "is_executing": false, "name": "#%%\n" } }, "outputs": [], "source": [ "\n", "#for i in set(df.clinic.values):\n", " #print('Processing: '+ str(i))\n", " #df2 = df[df['clinic']==i]\n", " #X_train, X_test, y_train, y_test = train_test(df2)\n", " #undersample = RandomUnderSampler(sampling_strategy='majority')\n", " #X_test, y_test = undersample.fit_resample(X_test, y_test)\n", " #if len(y_test.unique())>1:\n", " #testResults(grid_search,X_test,y_test)\n", " #else:\n", " #print(\"One Class:\",y_test.head(1))\n", "#print(\"DONE!\")\n", " " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "is_executing": false, "name": "#%%\n" } }, "outputs": [], "source": [ "#df['clinic'].unique()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "is_executing": false, "name": "#%%\n" } }, "outputs": [], "source": [ "\n", "##DUR EXPS\n", "print(\"Starting...\")\n", "durl=[15,30,40,45,50,60]\n", "file1='Data3_Corrected2.csv'\n", "list1=['NoShowAllCount','NoShowAll','NoShow10','NoShow10Count','AttendCat3','AttendCat5','AttendCat7']\n", "\n", "#print(\"Start...\")\n", "df=pd.read_csv(INPUT_DIR+file1)\n", "df=readDatatypes(df)\n", "df[attendCol] = df[attendCol].apply(lambda x: \"Show\" if x == \"1\" else \"NoShow\")\n", "\n", "\n", "for d in durl:\n", " print(d)\n", " df2 = df[df['appointment Duration'] >= d]\n", " df3=df2[df2[attendCol]==\"NoShow\"]\n", " print(str(df3.shape[0]/df2.shape[0]))\n", " \n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "is_executing": false, "name": "#%%\n" } }, "outputs": [], "source": [ "###Markov\n", "print(\"Start...\")\n", "file1='Data3_Corrected2.csv'\n", "list1=['NoShowAllCount','NoShowAll','NoShow10','NoShow10Count','AttendCat3','AttendCat5','AttendCat7']\n", "\n", "#print(\"Start...\")\n", "df=pd.read_csv(INPUT_DIR+file1)\n", "df=readDatatypes(df)\n", "calacNoShowMarkov(df, fileCol, dateCol, attendCol)\n", "df.to_csv(INPUT_DIR + 'Data3_Corrected2_Markov.csv', index=None, header=True, encoding='utf-8-sig')\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "is_executing": false, "name": "#%%\n" } }, "outputs": [], "source": [ "#### Figs\n", "\n", "\n", "print(\"Start...\")\n", "list1=['NoShowAllCount','NoShowAll','NoShow10','NoShow10Count','AttendCat3','AttendCat5','AttendCat7']\n", "\n", "\n", "file1='Data3_Corrected2.csv'\n", "df=pd.read_csv(INPUT_DIR+file1)\n", "df=readDatatypes(df)\n", "df[attendCol] = df[attendCol].apply(lambda x: \"Show\" if x == \"1\" else \"NoShow\")\n", "df.drop([fileCol,'Spciality','Date of birth','Nationality','Appointment Date','Appointment Time','Booking Date and Time'], axis = 1, inplace=True)\n", "df.drop(list1, axis = 1, inplace=True)\n", "\n", "\n", "if 'SHOW OR NOSHOW' in df.columns:\n", " df.drop('SHOW OR NOSHOW', axis=1, inplace=True)\n", "if 'appointment Status' in df.columns:\n", " df.drop('appointment Status', axis=1, inplace=True)\n", "if 'New Patient Or Not' in df.columns:\n", " df.drop('New Patient Or Not', axis=1, inplace=True)\n", "non_dummy_cols = ['Spciality', fileCol,attendCol,'DaysSinceAptInsertion', 'AgeAtAppointmentDate', 'DaysSinceLastApt', 'NumOfPrevVisitsWithThisDoctor', 'NumOfPrevVisits', 'NoShowAll', 'NoShow10', 'appointment Duration', 'Temperature', 'NoShowAllCount', 'NoShow10Count'] \n", "dummy_cols = list(set(df.columns) - set(non_dummy_cols))\n", "df = pd.get_dummies(df, columns=dummy_cols)\n", "\n", "X_train, X_test, y_train, y_test = train_test(df)\n", "undersample = RandomUnderSampler(sampling_strategy='majority')\n", "X_train, y_train = undersample.fit_resample(X_train, y_train)\n", "\n", "\n", "\n", "models=['Logistic Regression','Random Forests','Gradient Boosting']\n", "result_table = pd.DataFrame(columns=['classifiers', 'fpr','tpr','auc'])\n", "for model in models:\n", " print(\"Doing \"+ model)\n", " m=train(X_train,y_train, model[0]).best_estimator_\n", " yproba = m.predict_proba(X_test)[:1]\n", " fpr, tpr, _ = roc_curve(y_test, yproba, pos_label=posLabel)\n", " auc = roc_auc_score(y_test, yproba, labels=labels)\n", " result_table = result_table.append({'classifiers':model,\n", " 'fpr':fpr, \n", " 'tpr':tpr, \n", " 'auc':auc}, ignore_index=True)\n", "\n", "\n", " \n", " \n", "result_table.set_index('classifiers', inplace=True)\n", "\n", "fig = plt.figure(figsize=(8,6))\n", "\n", "for i in result_table.index:\n", " plt.plot(result_table.loc[i]['fpr'], \n", " result_table.loc[i]['tpr'], \n", " label=\"{}, AUC={:.3f}\".format(i, result_table.loc[i]['auc']))\n", " \n", "plt.plot([0,1], [0,1], color='orange', linestyle='--')\n", "\n", "plt.xticks(np.arange(0.0, 1.1, step=0.1))\n", "plt.xlabel(\"Flase Positive Rate\", fontsize=15)\n", "\n", "plt.yticks(np.arange(0.0, 1.1, step=0.1))\n", "plt.ylabel(\"True Positive Rate\", fontsize=15)\n", "\n", "plt.title('ROC Curve Analysis', fontweight='bold', fontsize=15)\n", "plt.legend(prop={'size':13}, loc='lower right')\n", "\n", "plt.show()\n", "\n", "\n", " \n", " \n", " \n", "\n", "print(\"Done ALL\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "is_executing": false, "name": "#%%\n" } }, "outputs": [], "source": [ "result_table['classifiers']=['LR','RF', 'GB']\n", "result_table.set_index('classifiers', inplace=True)\n", "lines=['--',':','-.']\n", "aucs=[0.717, 0.718, 0.712]\n", "\n", "fig = plt.figure(figsize=(8,6))\n", "count=0\n", "for i in result_table.index:\n", " print(lines[count])\n", " \n", " plt.plot(result_table.loc[i]['fpr'],\n", " result_table.loc[i]['tpr'], linestyle=lines[count], drawstyle='steps',\n", " label=\"{}, AUC={:.3f}\".format(i, aucs[count]))\n", " count=count+1\n", " \n", "plt.plot([0,1], [0,1], color='black', linestyle='-')\n", "\n", "plt.xticks(np.arange(0.0, 1.1, step=0.1))\n", "plt.xlabel(\"Flase Positive Rate\", fontsize=15)\n", "\n", "plt.yticks(np.arange(0.0, 1.1, step=0.1))\n", "plt.ylabel(\"True Positive Rate\", fontsize=15)\n", "\n", "plt.title('ROC Curve', fontweight='bold', fontsize=15)\n", "plt.legend(prop={'size':13}, loc='lower right')\n", "plt.savefig(INPUT_DIR+\"res.svg\")\n", "plt.show()\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "is_executing": false, "name": "#%%\n" } }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "is_executing": false, "name": "#%%\n" } }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [], "source": [ "result_table['classifiers']=['LR','RF', 'GB']\n", "result_table.set_index('classifiers', inplace=True)\n", "lines=['--',':','-.']\n", "aucs=[0.717, 0.718, 0.712]\n", "\n", "fig = plt.figure(figsize=(8,6))\n", "count=0\n", "for i in result_table.index:\n", " print(lines[count])\n", " \n", " plt.plot(result_table.loc[i]['fpr'],\n", " result_table.loc[i]['tpr'], linestyle=lines[count], drawstyle='steps',\n", " label=\"{}, AUC={:.3f}\".format(i, aucs[count]))\n", " count=count+1\n", " \n", "plt.plot([0,1], [0,1], color='black', linestyle='-')\n", "\n", "plt.xticks(np.arange(0.0, 1.1, step=0.1))\n", "plt.xlabel(\"Flase Positive Rate\", fontsize=15)\n", "\n", "plt.yticks(np.arange(0.0, 1.1, step=0.1))\n", "plt.ylabel(\"True Positive Rate\", fontsize=15)\n", "\n", "plt.title('ROC Curve', fontweight='bold', fontsize=15)\n", "plt.legend(prop={'size':13}, loc='lower right')\n", "plt.savefig(INPUT_DIR+\"res.svg\")\n", "plt.show()\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "PyCharm (NoShowData3)", "language": "python", "name": "pycharm-6a655149" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" }, "pycharm": { "stem_cell": { "cell_type": "raw", "source": [], "metadata": { "collapsed": false } } } }, "nbformat": 4, "nbformat_minor": 1 }