{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"collapsed_sections":[],"authorship_tag":"ABX9TyNfsymNa+gbvac/XPXmDpYt"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[
{"cell_type":"markdown","metadata":{},"source":[
"# Fake job posting detection\n",
"\n",
"Label-encodes `fake_job_postings.csv`, inspects feature/target correlations, reduces the features with PCA for a One-Class SVM, then trains and evaluates Decision Tree, AdaBoost, XGBoost, Bagging, Voting and Random Forest classifiers on the `fraudulent` target."
]},
{"cell_type":"code","metadata":{"id":"8DR6GQKBTmRN"},"execution_count":null,"outputs":[],"source":[
"# Standard library\n",
"import io\n",
"import time\n",
"import warnings\n",
"\n",
"# Third-party\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import plotly.express as px\n",
"import plotly.graph_objs as go\n",
"import cufflinks as cf\n",
"import imblearn\n",
"import xgboost as xgb\n",
"from imblearn.over_sampling import RandomOverSampler\n",
"from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot\n",
"from plotly.subplots import make_subplots\n",
"from scipy.stats import pearsonr\n",
"from sklearn import metrics, svm\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,\n",
"                              RandomForestClassifier, VotingClassifier)\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.metrics import (accuracy_score, classification_report,\n",
"                             cohen_kappa_score, confusion_matrix,\n",
"                             mean_squared_error)\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.preprocessing import LabelEncoder, StandardScaler\n",
"from sklearn.svm import SVC\n",
"from sklearn.tree import DecisionTreeClassifier"
]},
{"cell_type":"code","metadata":{"id":"AxIHuihVEwCb"},"execution_count":null,"outputs":[],"source":[
"# Notebook-wide configuration: plotting back-ends, warning policy and tunable constants.\n",
"%matplotlib inline\n",
"sns.set(style=\"ticks\", color_codes=True)\n",
"warnings.filterwarnings('ignore')\n",
"init_notebook_mode(connected = True)\n",
"cf.go_offline()\n",
"\n",
"CSV_NAME = 'fake_job_postings.csv'  # key expected in the Colab upload dict\n",
"TARGET = 'fraudulent'               # label column\n",
"N_COMPONENTS = 13                   # number of PCA components kept\n",
"SEED = 0                            # seed for steps that had none\n",
"\n",
"print(imblearn.__version__)"
]},
{"cell_type":"markdown","metadata":{},"source":[
"## Load data"
]},
{"cell_type":"code","metadata":{"id":"N6bB-DjBV6kb"},"execution_count":null,"outputs":[],"source":[
"# Colab-only dependency, kept next to its single use so the import cell also runs outside Colab.\n",
"from google.colab import files\n",
"\n",
"uploaded = files.upload()\n",
"\n",
"for fn, content in uploaded.items():\n",
"    print('User uploaded file \"{name}\" with length {length} bytes'.format(\n",
"        name=fn, length=len(content)))"
]},
{"cell_type":"code","metadata":{"id":"6mCIIeAVEt6c"},"execution_count":null,"outputs":[],"source":[
"raw_df = pd.read_csv(io.StringIO(uploaded[CSV_NAME].decode('utf-8')))\n",
"raw_df.head()"
]},
{"cell_type":"markdown","metadata":{"id":"BDS6hM3WJpyY"},"source":[
"## Transforming every column into numerical codes"
]},
{"cell_type":"code","metadata":{"id":"2Q55Q4apFKjo"},"execution_count":null,"outputs":[],"source":[
"def lencoder_feature(x_labels):\n",
"    \"\"\"Label-encode one column.\n",
"\n",
"    A fresh ``LabelEncoder`` is fitted on every call, so the integer codes of\n",
"    different columns are independent of each other.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    x_labels : array-like\n",
"        Values of a single column.\n",
"\n",
"    Returns\n",
"    -------\n",
"    numpy.ndarray\n",
"        Integer code of every value in ``x_labels``.\n",
"    \"\"\"\n",
"    return LabelEncoder().fit_transform(x_labels)"
]},
{"cell_type":"code","metadata":{},"execution_count":null,"outputs":[],"source":[
"# `raw_df` is left untouched so this cell can be re-run safely.\n",
"df = raw_df.apply(lencoder_feature)\n",
"df.head()"
]},
{"cell_type":"code","metadata":{"id":"A1LsDDdm1vmh"},"execution_count":null,"outputs":[],"source":[
"df.info()"
]},
{"cell_type":"code","metadata":{"id":"QLLGCxS-FdDK"},"execution_count":null,"outputs":[],"source":[
"feature = df.drop(columns=[TARGET])\n",
"label = df[TARGET]"
]},
{"cell_type":"markdown","metadata":{},"source":[
"## Pearson correlation of each feature with the target"
]},
{"cell_type":"code","metadata":{"id":"wuP1a3ZVFuFi"},"execution_count":null,"outputs":[],"source":[
"# One column per feature; row 0 is Pearson's r, row 1 the p-value.\n",
"corre = pd.DataFrame(\n",
"    {col: tuple(pearsonr(label, feature[col])) for col in feature.columns}\n",
")\n",
"corre"
]},
{"cell_type":"code","metadata":{"id":"HzGjwaThFw92"},"execution_count":null,"outputs":[],"source":[
"corre1 = corre.T\n",
"corre1"
]},
{"cell_type":"code","metadata":{"id":"Yk9vX_hsF1b6"},"execution_count":null,"outputs":[],"source":[
"# Features ranked by Pearson's r with the target, largest first.\n",
"corre_ranked = corre1.iloc[:, 0].sort_values(ascending=False)\n",
"corre_ranked"
]},
{"cell_type":"markdown","metadata":{"id":"B5c25MGdGJ15"},"source":[
"## Feature selection (dimensionality reduction with PCA)"
]},
{"cell_type":"code","metadata":{"id":"BwlMP9pBF-CK"},"execution_count":null,"outputs":[],"source":[
"Transf_pca = PCA(n_components=N_COMPONENTS)\n",
"datanew = Transf_pca.fit_transform(feature)\n",
"datanew.shape"
]},
{"cell_type":"code","metadata":{"id":"hmSgwGf4Lggr"},"execution_count":null,"outputs":[],"source":[
"datanew[:5]"
]},
{"cell_type":"code","metadata":{"id":"R5V1N_cHGT47"},"execution_count":null,"outputs":[],"source":[
"# Append the target after the PCA components. The frame gets integer column\n",
"# names 0..N_COMPONENTS, so the target is the column named N_COMPONENTS.\n",
"newbase = pd.DataFrame(np.column_stack((datanew, label)))\n",
"LABEL_COL = N_COMPONENTS\n",
"assert newbase.shape == (len(df), N_COMPONENTS + 1)\n",
"print(newbase.columns)\n",
"newbase.shape"
]},
{"cell_type":"code","metadata":{"id":"EBLPYfimAQRx"},"execution_count":null,"outputs":[],"source":[
"newbase.head()  # reduced dataset: N_COMPONENTS PCA components plus the target column"
]},
{"cell_type":"markdown","metadata":{"id":"artkvr_ZGili"},"source":[
"## One-Class SVM on fake postings\n",
"\n",
"Separate fake and real postings, fit a One-Class SVM on fake postings only, then test it on held-out fake postings plus all real postings."
]},
{"cell_type":"code","metadata":{"id":"s2LTe3CqGYP0"},"execution_count":null,"outputs":[],"source":[
"dataset_selected1 = newbase.loc[newbase[LABEL_COL] == 1]\n",
"dataset_selected0 = newbase.loc[newbase[LABEL_COL] == 0]\n",
"\n",
"# Fake postings: split into features (data1) and labels (label1, all equal to 1).\n",
"label1 = dataset_selected1[LABEL_COL]\n",
"data1 = dataset_selected1.drop(columns=[LABEL_COL])\n",
"\n",
"# Real postings: split into features (data0) and labels (label0).\n",
"# Labels are shifted from 0 to -1, the value OneClassSVM.predict uses for outliers.\n",
"label0 = dataset_selected0[LABEL_COL] - 1\n",
"data0 = dataset_selected0.drop(columns=[LABEL_COL])"
]},
{"cell_type":"code","metadata":{"id":"iy99OBEYGpkO"},"execution_count":null,"outputs":[],"source":[
"x_train1, x_test1, y_train1, y_test1 = train_test_split(\n",
"    data1, label1, test_size=0.33, random_state=0)\n",
"\n",
"# NOTE(review): nu=1 is the upper end of the range OneClassSVM accepts - confirm it is intended.\n",
"oc_svm = svm.OneClassSVM(kernel='rbf', nu=1, gamma=0.00001)\n",
"start = time.time()\n",
"oc_svm.fit(x_train1)\n",
"fit_seconds = time.time() - start\n",
"print('One-Class SVM fit time (s):', fit_seconds)"
]},
{"cell_type":"code","metadata":{"id":"6paUqiB-GtFo"},"execution_count":null,"outputs":[],"source":[
"# Test set = held-out fake postings (+1) followed by every real posting (-1).\n",
"data_tesst = np.concatenate((x_test1, data0), axis=0)\n",
"label_tesst = np.concatenate((y_test1, label0), axis=0)\n",
"pred = oc_svm.predict(data_tesst)"
]},
{"cell_type":"code","metadata":{"id":"J3OHuGYtGws6"},"execution_count":null,"outputs":[],"source":[
"ACC = accuracy_score(label_tesst, pred) * 100\n",
"print('the accurency score is :')\n",
"print(ACC)"
]},
{"cell_type":"code","metadata":{"id":"EEr_ygSiG0bi"},"execution_count":null,"outputs":[],"source":[
"print(classification_report(label_tesst, pred))"
]},
{"cell_type":"markdown","metadata":{"id":"GzOR1bIRG9Rp"},"source":[
"## Supervised classifiers: predicting fake job postings\n",
"\n",
"The classifiers below are trained on the label-encoded `feature` frame; the PCA components in `newbase` are only used by the One-Class SVM above.\n",
"\n",
"**TODO:** confirm that training on the un-reduced features (rather than the PCA components) is intended."
]},
{"cell_type":"code","metadata":{"id":"bldNFzhkHDAR"},"execution_count":null,"outputs":[],"source":[
"label.value_counts()"
]},
{"cell_type":"markdown","metadata":{},"source":[
"### Train/test split\n",
"\n",
"The split is done **before** oversampling. Oversampling first would put copies of the same minority rows in both the training and the test set and inflate every score."
]},
{"cell_type":"code","metadata":{"id":"oWddCjLDBwWU"},"execution_count":null,"outputs":[],"source":[
"# stratify keeps the class ratio identical in both splits, so the rare class is present in the test set.\n",
"x_train_raw, x_test_raw, y_train_raw, y_test = train_test_split(\n",
"    feature, label, test_size=0.3, random_state=0, stratify=label)\n",
"assert len(x_train_raw) + len(x_test_raw) == len(feature)\n",
"y_test.value_counts()"
]},
{"cell_type":"code","metadata":{"id":"NQJrgoj7Cjkc"},"execution_count":null,"outputs":[],"source":[
"x_test_raw.head()"
]},
{"cell_type":"markdown","metadata":{"id":"rVumTlLNHXbR"},"source":[
"### Using random oversampling to deal with imbalanced data (training split only)"
]},
{"cell_type":"code","metadata":{"id":"vgNC6zp2HOlv"},"execution_count":null,"outputs":[],"source":[
"# sampling_strategy=1.0 resamples the minority class until it matches the majority class size.\n",
"oversample = RandomOverSampler(sampling_strategy=1.0, random_state=SEED)\n",
"\n",
"# Fit and apply the transform on the training rows only; the test rows stay untouched.\n",
"x_train_over, y_train = oversample.fit_resample(x_train_raw, y_train_raw)\n",
"\n",
"# Count how many 1s and 0s the resampled training labels contain.\n",
"y_train.value_counts()"
]},
{"cell_type":"code","metadata":{"id":"8WGvL3PN6GAM"},"execution_count":null,"outputs":[],"source":[
"x_train_over.info()"
]},
{"cell_type":"code","metadata":{"id":"0fXXbPPq-HEZ"},"execution_count":null,"outputs":[],"source":[
"x_train_over.head()"
]},
{"cell_type":"markdown","metadata":{"id":"ztiqT-tsB7gt"},"source":[
"### Feature scaling"
]},
{"cell_type":"code","metadata":{"id":"FYSVgQntB64Y"},"execution_count":null,"outputs":[],"source":[
"# The scaler is fitted on the training data only and then applied to the test data.\n",
"sc_X = StandardScaler()\n",
"x_train = sc_X.fit_transform(x_train_over)\n",
"x_test = sc_X.transform(x_test_raw)"
]},
{"cell_type":"code","metadata":{"id":"H9ow3e8YCUbf"},"execution_count":null,"outputs":[],"source":[
"x_test[:5]"
]},
{"cell_type":"markdown","metadata":{},"source":[
"### Evaluation helpers\n",
"\n",
"Shared by every classifier below: accuracy, Cohen's kappa, confusion matrix, classification report, and the ROC curve."
]},
{"cell_type":"code","metadata":{},"execution_count":null,"outputs":[],"source":[
"def evaluate_classifier(name, y_true, y_pred):\n",
"    \"\"\"Print accuracy, Cohen's kappa, confusion matrix and classification report.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    name : str\n",
"        Model name inserted into the printed accuracy line.\n",
"    y_true : array-like\n",
"        True class labels.\n",
"    y_pred : array-like\n",
"        Predicted class labels.\n",
"\n",
"    Returns\n",
"    -------\n",
"    float\n",
"        Accuracy in percent.\n",
"    \"\"\"\n",
"    acc = accuracy_score(y_true, y_pred) * 100\n",
"    print('With {} accuracy is: '.format(name), acc)\n",
"    print('Cohen kappa: ', cohen_kappa_score(y_true, y_pred))\n",
"    print('Confusion matrix: \\n', confusion_matrix(y_true, y_pred))\n",
"    print('Classification Report: \\n', classification_report(y_true, y_pred))\n",
"    return acc\n",
"\n",
"\n",
"def plot_roc(y_true, y_score, title):\n",
"    \"\"\"Plot a ROC curve and print the arrays it is drawn from.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    y_true : array-like\n",
"        True binary labels.\n",
"    y_score : array-like\n",
"        Score of the positive class. Continuous scores (e.g. probabilities)\n",
"        trace a full curve; hard 0/1 predictions give one operating point.\n",
"    title : str\n",
"        Figure title.\n",
"\n",
"    Returns\n",
"    -------\n",
"    tuple\n",
"        ``(fpr, tpr, thresholds)`` as returned by ``sklearn.metrics.roc_curve``.\n",
"    \"\"\"\n",
"    fpr, tpr, thresholds = metrics.roc_curve(y_true, y_score)\n",
"    fig, ax = plt.subplots()\n",
"    ax.plot(fpr, tpr)\n",
"    ax.set_xlim([0.0, 1.0])\n",
"    ax.set_ylim([0.0, 1.0])\n",
"    ax.set_title(title)\n",
"    ax.set_xlabel('False Positive Rate')\n",
"    ax.set_ylabel('True Positive Rate')\n",
"    ax.grid(True)\n",
"    print('\\n\\n\\n FPR, TPR, and Thresholds')\n",
"    print(\"False Positive Rate:\", fpr)  # value between 0 and 1\n",
"    print(\"True Positive Rate:\", tpr)  # value between 0 and 1; the curve is based on fpr and tpr only\n",
"    print(\"Thresholds: \", thresholds)\n",
"    return fpr, tpr, thresholds"
]},
{"cell_type":"markdown","metadata":{"id":"R0rm6dzgHi3L"},"source":[
"## Decision Tree classifier\n",
"\n",
"Applying a Decision Tree classifier to predict fake or real job postings."
]},
{"cell_type":"code","metadata":{"id":"vmafkGrvHcwN"},"execution_count":null,"outputs":[],"source":[
"dtree = DecisionTreeClassifier(random_state=SEED)\n",
"start = time.time()\n",
"dtree.fit(x_train, y_train)\n",
"fit_seconds = time.time() - start\n",
"prediction = dtree.predict(x_test)"
]},
{"cell_type":"code","metadata":{"id":"D7onxILHHn_Q"},"execution_count":null,"outputs":[],"source":[
"ACC = evaluate_classifier('decision tree', y_test, prediction)"
]},
{"cell_type":"markdown","metadata":{"id":"eo1TYRygrEln"},"source":[
"# **AdaBoost Classifier**"
]},
{"cell_type":"code","metadata":{"id":"x68Tdz32YRWn"},"execution_count":null,"outputs":[],"source":[
"ada_clf = AdaBoostClassifier(n_estimators=1600, random_state=0)\n",
"ada_clf.fit(x_train, y_train)\n",
"\n",
"y_pred = ada_clf.predict(x_test)"
]},
{"cell_type":"markdown","metadata":{"id":"oGRFt5udrQXw"},"source":[
"**Accuracy, kappa value, confusion matrix and classification report (AB)**"
]},
{"cell_type":"code","metadata":{"id":"EBdnZr7KY5ep"},"execution_count":null,"outputs":[],"source":[
"ACC = evaluate_classifier('AdaBoost', y_test, y_pred)"
]},
{"cell_type":"markdown","metadata":{"id":"VYlVrk6htjVU"},"source":[
"**ROC curve (AB)**"
]},
{"cell_type":"code","metadata":{"id":"SpTQqCgft1e2"},"execution_count":null,"outputs":[],"source":[
"# Use the positive-class probability rather than hard labels so the curve has more than one operating point.\n",
"fpr, tpr, thresholds = plot_roc(\n",
"    y_test, ada_clf.predict_proba(x_test)[:, 1],\n",
"    'Receiver Operating Characteristic of AB')"
]},
{"cell_type":"markdown","metadata":{"id":"MNgq-je5uTyZ"},"source":[
"# XGBoost classifier"
]},
{"cell_type":"code","metadata":{"id":"sdkk-gsYZHMK"},"execution_count":null,"outputs":[],"source":[
"# XGBClassifier predicts class labels directly; a regressor would need rounding\n",
"# and could produce values outside {0, 1}.\n",
"xgb_clf = xgb.XGBClassifier(max_depth = 5, n_estimators = 110)\n",
"xgb_clf.fit(x_train, y_train)\n",
"\n",
"preds = xgb_clf.predict(x_test)"
]},
{"cell_type":"markdown","metadata":{"id":"byJJHStSudPH"},"source":[
"**Accuracy, kappa value, confusion matrix and classification report (XGB)**"
]},
{"cell_type":"code","metadata":{"id":"kcCtBZ6wbVaT"},"execution_count":null,"outputs":[],"source":[
"ACC = evaluate_classifier('XgBoost', y_test, preds)"
]},
{"cell_type":"markdown","metadata":{"id":"3f-ogN-CxWyv"},"source":[
"**ROC curve (XGB)**"
]},
{"cell_type":"code","metadata":{"id":"Js_WtG8KxbKS"},"execution_count":null,"outputs":[],"source":[
"fpr, tpr, thresholds = plot_roc(\n",
"    y_test, xgb_clf.predict_proba(x_test)[:, 1],\n",
"    'Receiver Operating Characteristic of XGB')"
]},
{"cell_type":"markdown","metadata":{"id":"vFkSB-9JOBxa"},"source":[
"## Bagging"
]},
{"cell_type":"code","metadata":{"id":"x2NJoS1obgnU"},"execution_count":null,"outputs":[],"source":[
"# The base estimator is passed positionally: its keyword was renamed from\n",
"# `base_estimator` to `estimator` across scikit-learn releases.\n",
"bag_clf = BaggingClassifier(SVC(),\n",
"                            n_estimators = 200,\n",
"                            random_state = 8)\n",
"bag_clf.fit(x_train, y_train)\n",
"bag_predict = bag_clf.predict(x_test)"
]},
{"cell_type":"markdown","metadata":{"id":"js4D8hMenULn"},"source":[
"**Accuracy, kappa value, confusion matrix and classification report (Bagging)**"
]},
{"cell_type":"code","metadata":{"id":"r-G1k8kwj2EC"},"execution_count":null,"outputs":[],"source":[
"ACC = evaluate_classifier('Bagging', y_test, bag_predict)"
]},
{"cell_type":"markdown","metadata":{"id":"jc8sGrmWOJT7"},"source":[
"## Voting"
]},
{"cell_type":"code","metadata":{"id":"vM_mwvkzkCy6"},"execution_count":null,"outputs":[],"source":[
"# Instantiate the four base classifiers.\n",
"# The SVC instance is not called `svm`, so the imported `sklearn.svm` module is not shadowed.\n",
"knn_clf = KNeighborsClassifier()\n",
"logReg = LogisticRegression()\n",
"dTree = DecisionTreeClassifier()\n",
"svc_clf = SVC()\n",
"voting_clf = VotingClassifier(estimators=[('SVC', svc_clf), ('DecisionTree', dTree), ('LogReg', logReg), ('KNN', knn_clf)], voting='hard')\n",
"# Fit and predict using the training and testing datasets respectively.\n",
"voting_clf.fit(x_train, y_train)\n",
"vt_predict = voting_clf.predict(x_test)"
]},
{"cell_type":"code","metadata":{"id":"qULMMGbUkvkO"},"execution_count":null,"outputs":[],"source":[
"ACC = evaluate_classifier('Voting', y_test, vt_predict)"
]},
{"cell_type":"code","metadata":{"id":"8GjhZj7ZPjV5"},"execution_count":null,"outputs":[],"source":[
"# Hard voting exposes no class probabilities, so the curve is drawn from the\n",
"# predicted labels and reduces to a single operating point.\n",
"fpr, tpr, thresholds = plot_roc(\n",
"    y_test, vt_predict,\n",
"    'Receiver Operating Characteristic of Voting')"
]},
{"cell_type":"markdown","metadata":{"id":"lI8v8PC2PlSA"},"source":[
"## Random Forest"
]},
{"cell_type":"code","metadata":{"id":"EpNLHpaazmUy"},"execution_count":null,"outputs":[],"source":[
"rf = RandomForestClassifier(n_estimators=10, criterion = 'entropy', max_depth=10, random_state=8)\n",
"rf.fit(x_train, y_train)\n",
"\n",
"rf_pred = rf.predict(x_test)"
]},
{"cell_type":"code","metadata":{"id":"gLi3ne6DP0aI"},"execution_count":null,"outputs":[],"source":[
"ACC = evaluate_classifier('Random Forest', y_test, rf_pred)"
]},
{"cell_type":"code","metadata":{"id":"QldqXLxDWQJU"},"execution_count":null,"outputs":[],"source":[
"fpr, tpr, thresholds = plot_roc(\n",
"    y_test, rf.predict_proba(x_test)[:, 1],\n",
"    'Receiver Operating Characteristic of RF')"
]}
]}