{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"pygments_lexer":"ipython3","nbconvert_exporter":"python","version":"3.6.4","file_extension":".py","codemirror_mode":{"name":"ipython","version":3},"name":"python","mimetype":"text/x-python"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"import pandas as pd\nimport numpy as np\nimport json\nimport tensorflow.keras.layers as L\nimport keras.backend as K\nimport tensorflow as tf\nimport plotly.express as px\nfrom sklearn.model_selection import StratifiedKFold, KFold, GroupKFold\nfrom sklearn.cluster import KMeans\nimport os\n\nos.environ['CUDA_VISIBLE_DEVICES'] = '0'\ndef allocate_gpu_memory(gpu_number=0):\n physical_devices = tf.config.experimental.list_physical_devices('GPU')\n\n if physical_devices:\n try:\n print(\"Found {} GPU(s)\".format(len(physical_devices)))\n tf.config.set_visible_devices(physical_devices[gpu_number], 'GPU')\n tf.config.experimental.set_memory_growth(physical_devices[gpu_number], True)\n print(\"#{} GPU memory is allocated\".format(gpu_number))\n except RuntimeError as e:\n print(e)\n else:\n print(\"Not enough GPU hardware devices available\")\nallocate_gpu_memory()\n\nVer='GRU_LSTM1'\naug_data = '../input/augmentation/aug_data1.csv'\ndebug = False","metadata":{"_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"def gru_layer(hidden_dim, dropout):\n return L.Bidirectional(L.GRU(hidden_dim, dropout=dropout, return_sequences=True, kernel_initializer = 'orthogonal'))\n\ndef lstm_layer(hidden_dim, dropout):\n return L.Bidirectional(L.LSTM(hidden_dim, dropout=dropout, return_sequences=True, kernel_initializer = 'orthogonal'))\n\ndef build_model(seq_len=107, pred_len=68, dropout=0.5, embed_dim=100, hidden_dim=256, type=1):\n inputs = L.Input(shape=(seq_len, 3))\n \n # split categorical and numerical features and concatenate them later.\n categorical_feat_dim = 3\n categorical_fea = inputs[:, :, :categorical_feat_dim]\n numerical_fea = inputs[:, :, 5:]\n\n embed = L.Embedding(input_dim=len(token2int), output_dim=embed_dim)(categorical_fea)\n reshaped = tf.reshape(embed, shape=(-1, embed.shape[1], embed.shape[2] * embed.shape[3]))\n #reshaped = L.concatenate([reshaped, numerical_fea], axis=2)\n \n\n if type == 0:\n hidden = gru_layer(hidden_dim, dropout)(reshaped)\n hidden = gru_layer(hidden_dim, dropout)(hidden)\n hidden = gru_layer(hidden_dim, dropout)(hidden)\n \n elif type == 1:\n hidden = lstm_layer(hidden_dim, dropout)(reshaped)\n hidden = gru_layer(hidden_dim, dropout)(hidden)\n hidden = gru_layer(hidden_dim, dropout)(hidden)\n \n elif type == 2:\n hidden = gru_layer(hidden_dim, dropout)(reshaped)\n hidden = lstm_layer(hidden_dim, dropout)(hidden)\n hidden = gru_layer(hidden_dim, dropout)(hidden)\n \n elif type == 3:\n hidden = gru_layer(hidden_dim, dropout)(reshaped)\n hidden = gru_layer(hidden_dim, dropout)(hidden)\n hidden = lstm_layer(hidden_dim, dropout)(hidden)\n elif type == 4:\n hidden = lstm_layer(hidden_dim, dropout)(reshaped)\n hidden = lstm_layer(hidden_dim, dropout)(hidden)\n hidden = lstm_layer(hidden_dim, dropout)(hidden)\n \n \n \n truncated = hidden[:, :pred_len]\n out = L.Dense(5, activation='linear')(truncated)\n model = tf.keras.Model(inputs=inputs, outputs=out)\n model.compile(tf.keras.optimizers.Adam(), loss=mcrmse)\n return model","metadata":{"_cell_guid":"79c7e3d0-c299-4dcb-8224-4455121ee9b0","_uuid":"d629ff2d2480ee46fbb7e2d37f6b5fab8052498a"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"token2int = {x:i for i, x in enumerate('().ACGUBEHIMSX')}\npred_cols = ['reactivity', 'deg_Mg_pH10', 'deg_pH10', 'deg_Mg_50C', 'deg_50C']\n\ndef preprocess_inputs(df, cols=['sequence', 'structure', 'predicted_loop_type']):\n base_fea = np.transpose(\n np.array(\n df[cols]\n .applymap(lambda seq: [token2int[x] for x in seq])\n .values\n .tolist()\n ),\n (0, 2, 1)\n )\n bpps_sum_fea = np.array(df['bpps_sum'].to_list())[:,:,np.newaxis]\n bpps_max_fea = np.array(df['bpps_max'].to_list())[:,:,np.newaxis]\n bpps_nb_fea = np.array(df['bpps_nb'].to_list())[:,:,np.newaxis]\n bpps_v_fea = np.array(df['bpps_v'].to_list())[:,:,np.newaxis]\n bpps_m_fea = np.array(df['bpps_m'].to_list())[:,:,np.newaxis]\n return np.concatenate([base_fea,bpps_sum_fea,bpps_max_fea,bpps_nb_fea,bpps_v_fea,bpps_m_fea], 2)\n return base_fea\n\ndef rmse(y_actual, y_pred):\n mse = tf.keras.losses.mean_squared_error(y_actual, y_pred)\n return K.sqrt(mse)\n\ndef mcrmse(y_actual, y_pred, num_scored=len(pred_cols)):\n score = 0\n for i in range(num_scored):\n score += rmse(y_actual[:, :, i], y_pred[:, :, i]) / num_scored\n return score","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"train = pd.read_json('../input/stanford-covid-vaccine/train.json', lines=True)\ntest = pd.read_json('../input/stanford-covid-vaccine/test.json', lines=True)","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"aug_data = '../input/augmentation/aug_data1.csv'\ndebug = False\naug_df = pd.read_csv(aug_data)\ndef aug_data(df):\n target_df = df.copy()\n new_df = aug_df[aug_df['id'].isin(target_df['id'])]\n \n del target_df['structure']\n del target_df['predicted_loop_type']\n new_df = new_df.merge(target_df, on=['id','sequence'], how='left')\n\n df['cnt'] = df['id'].map(new_df[['id','cnt']].set_index('id').to_dict()['cnt'])\n df['log_gamma'] = 100\n df['score'] = 1.0\n df = df.append(new_df[df.columns])\n return df\ntrain = aug_data(train)\ntest = aug_data(test)","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"train.head()","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# additional features\n\ndef read_bpps_sum(df):\n bpps_arr = []\n for mol_id in df.id.to_list():\n bpps_arr.append(np.load(f\"../input/stanford-covid-vaccine/bpps/{mol_id}.npy\").sum(axis=1))\n return bpps_arr\n\ndef read_bpps_max(df):\n bpps_arr = []\n for mol_id in df.id.to_list():\n bpps_arr.append(np.load(f\"../input/stanford-covid-vaccine/bpps/{mol_id}.npy\").max(axis=1))\n return bpps_arr\n\ndef read_bpps_nb(df):\n # normalized non-zero number\n # from https://www.kaggle.com/symyksr/openvaccine-deepergcn \n bpps_nb_mean = 0.077522 # mean of bpps_nb across all training data\n bpps_nb_std = 0.08914 # std of bpps_nb across all training data\n bpps_arr = []\n for mol_id in df.id.to_list():\n bpps = np.load(f\"../input/stanford-covid-vaccine/bpps/{mol_id}.npy\")\n bpps_nb = (bpps > 0).sum(axis=0) / bpps.shape[0]\n bpps_nb = (bpps_nb - bpps_nb_mean) / bpps_nb_std\n bpps_arr.append(bpps_nb)\n return bpps_arr \ndef read_bpps_m(df):\n e=0.00000001\n bpps_arr = []\n for mol_id in df.id.to_list():\n bpps = np.load(f\"../input/stanford-covid-vaccine/bpps/{mol_id}.npy\")\n vec=[]\n for i in range(bpps.shape[0]):\n m=0\n l=0\n for j in range(bpps.shape[0]):\n if bpps[i][j]>0:\n m=m+(j*bpps[i][j])\n l=l+1\n m=m/(l+e)\n vec.append(m)\n bpps_arr.append(vec)\n return bpps_arr \n\ndef read_bpps_v(df):\n b = 0.9 # beta for exponential weaghted average with bias correction\n e=0.00000001\n bpps_arr = []\n for mol_id in df.id.to_list():\n bpps = np.load(f\"../input/stanford-covid-vaccine/bpps/{mol_id}.npy\")\n vec=[]\n for i in range(bpps.shape[0]):\n v=0\n m=0\n l=0\n for j in range(bpps.shape[0]):\n if bpps[i][j]>0:\n v=(b*v+(1-b)*bpps[i][j])\n m=m+v\n l=l+1\n m=m/(l+e)\n vec.append(m)\n bpps_arr.append(vec)\n return bpps_arr \n\n\ntrain['bpps_sum'] = read_bpps_sum(train)\ntest['bpps_sum'] = read_bpps_sum(test)\ntrain['bpps_max'] = read_bpps_max(train)\ntest['bpps_max'] = read_bpps_max(test)\ntrain['bpps_nb'] = read_bpps_nb(train)\ntest['bpps_nb'] = read_bpps_nb(test)\ntrain['bpps_v'] = read_bpps_v(train)\ntest['bpps_v'] = read_bpps_v(test)\ntrain['bpps_m'] = read_bpps_m(train)\ntest['bpps_m'] = read_bpps_m(test)","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"train.shape","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from sklearn.model_selection import train_test_split, KFold\ntarget_cols = ['reactivity', 'deg_Mg_pH10', 'deg_pH10', 'deg_Mg_50C', 'deg_50C']\ntrain_inputs = preprocess_inputs(train[train.signal_to_noise > 1])\ntrain_labels = np.array(train[train.signal_to_noise > 1][target_cols].values.tolist()).transpose((0, 2, 1))\ntrain_inputs, val_inputs, train_labels, val_labels = train_test_split(train_inputs, train_labels, test_size=.1, random_state=34)","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"train_inputs.shape\n#val_inputs.shape","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"lr_callback = tf.keras.callbacks.ReduceLROnPlateau()\n\ngru = build_model(type=0)\nsv_gru = tf.keras.callbacks.ModelCheckpoint('model_gru.h5')\n\nhistory_gru = gru.fit(\n train_inputs, train_labels, \n validation_data=(val_inputs,val_labels),\n batch_size=64,\n epochs=100,\n callbacks=[lr_callback,sv_gru],\n verbose = 2\n)\n\nprint(f\"Min training loss={min(history_gru.history['loss'])}, min validation loss={min(history_gru.history['val_loss'])}\")","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"fig = px.line(history_gru.history, y=['loss', 'val_loss'], labels={'index': 'epoch', 'value': 'Mean Columnwise Root Mean Squared Error'}, title='Training History')\nfig.show()","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"lr_callback = tf.keras.callbacks.ReduceLROnPlateau()\n\nlstm = build_model(type=4)\nsv_lstm = tf.keras.callbacks.ModelCheckpoint('model_lstm.h5')\n\nhistory_lstm = lstm.fit(\n train_inputs, train_labels, \n validation_data=(val_inputs,val_labels),\n batch_size=64,\n epochs=100,\n callbacks=[lr_callback,sv_lstm],\n verbose = 2\n)\n\n","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"print(f\"Min training loss={min(history_lstm.history['loss'])}, min validation loss={min(history_lstm.history['val_loss'])}\")","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"fig = px.line(history_lstm.history, y=['loss', 'val_loss'], labels={'index': 'epoch', 'value': 'Mean Columnwise Root Mean Squared Error'}, title='Training History')\nfig.show()","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"lr_callback = tf.keras.callbacks.ReduceLROnPlateau()\n\nhyprid = build_model(type=2)\nsv_lstm = tf.keras.callbacks.ModelCheckpoint('model_hyprid.h5')\n\nhistory_hyprid = hyprid.fit(\n train_inputs, train_labels, \n validation_data=(val_inputs,val_labels),\n batch_size=64,\n epochs=100,\n callbacks=[lr_callback,sv_lstm],\n verbose = 2\n)\n\nprint(f\"Min training loss={min(history_hyprid.history['loss'])}, min validation loss={min(history_hyprid.history['val_loss'])}\")","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"print(f\"Min training loss={min(history_hyprid.history['loss'])}, min validation loss={min(history_hyprid.history['val_loss'])}\")","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"fig = px.line(history_hyprid.history, y=['loss', 'val_loss'], labels={'index': 'epoch', 'value': 'Mean Columnwise Root Mean Squared Error'}, title='Training History')\nfig.show()","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]}]}