{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "1d56aa6c",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import os\n",
"import math\n",
"from tabulate import tabulate\n",
"from difflib import SequenceMatcher\n",
"import warnings\n",
"warnings.filterwarnings('ignore')\n",
"from collections import defaultdict\n",
"from itertools import chain, combinations\n",
"from fpgrowth_py import fpgrowth\n",
"import time\n",
"\n",
"from dna import dna\n",
"from scov import numpy_image_dict\n",
"from helper import *\n",
"import zlib\n",
"import lzma"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "1d03a554",
"metadata": {},
"outputs": [],
"source": [
"input_file1 = r'./input/China_Seq.txt'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "ddb79f61",
"metadata": {},
"outputs": [],
"source": [
"input_file2= r'./input/USA_Seq.txt'"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "07294521",
"metadata": {},
"outputs": [],
"source": [
"df_sequence = pd.DataFrame(columns=[\"line no\",\"sequence ratios\"])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "ae2da989",
"metadata": {},
"outputs": [],
"source": [
"count_lines=0"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "adf7ace4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" line no | \n",
" sequence ratios | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [line no, sequence ratios]\n",
"Index: []"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_sequence"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "d32d1210",
"metadata": {},
"outputs": [],
"source": [
"current_start_time = time.time()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "a48476fe",
"metadata": {},
"outputs": [],
"source": [
"# Compare the two files line by line and record one similarity ratio per line pair.\n",
"# The rows are gathered in a list and the frame is built once, so re-running this cell\n",
"# starts from scratch instead of appending to df_sequence and count_lines again.\n",
"sequence_rows = []\n",
"with open(input_file1) as file1, open(input_file2) as file2:\n",
"    # NOTE: zip() stops at the shorter file; surplus lines in the longer one are not compared.\n",
"    for lineno, (sequence1, sequence2) in enumerate(zip(file1, file2), 1):\n",
"        # Strip the line terminator before comparing: a last line that ends with a newline in\n",
"        # only one of the files would otherwise be reported as changed.\n",
"        sequence = SequenceMatcher(a=sequence1.rstrip('\\n'), b=sequence2.rstrip('\\n'))\n",
"        sequence_rows.append((lineno, sequence.ratio()))\n",
"\n",
"df_sequence = pd.DataFrame(sequence_rows, columns=['line no', 'sequence ratios'])\n",
"df_sequence['line no'] = df_sequence['line no'].astype(int)\n",
"count_lines = len(df_sequence)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "761e7493",
"metadata": {},
"outputs": [],
"source": [
"current_end_time = time.time()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "54c99034",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total Time : 1.7870872020721436\n"
]
}
],
"source": [
"print(\"Total Time :\",current_end_time-current_start_time)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "24d5ea72",
"metadata": {},
"outputs": [],
"source": [
"df_sequence_missings = df_sequence[df_sequence['sequence ratios']<1]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "f74e49b1",
"metadata": {},
"outputs": [],
"source": [
"length_changed_genome = len(df_sequence_missings)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "d7393abd",
"metadata": {},
"outputs": [],
"source": [
"perc_missing_values = (length_changed_genome/count_lines)*100"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "63ddc7c9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Percentage of changed Genome Sequence 1.31 %\n"
]
}
],
"source": [
"print(\"Percentage of changed Genome Sequence \",round(perc_missing_values,2),\"%\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "64ef5d9d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" line no | \n",
" sequence ratios | \n",
"
\n",
" \n",
" \n",
" \n",
" 121 | \n",
" 122 | \n",
" 0.985915 | \n",
"
\n",
" \n",
" 249 | \n",
" 250 | \n",
" 0.985915 | \n",
"
\n",
" \n",
" 251 | \n",
" 252 | \n",
" 0.985915 | \n",
"
\n",
" \n",
" 254 | \n",
" 255 | \n",
" 0.985915 | \n",
"
\n",
" \n",
" 426 | \n",
" 427 | \n",
" 0.985915 | \n",
"
\n",
" \n",
" 589 | \n",
" 590 | \n",
" 0.985915 | \n",
"
\n",
" \n",
" 601 | \n",
" 602 | \n",
" 0.985915 | \n",
"
\n",
" \n",
" 611 | \n",
" 612 | \n",
" 0.989474 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" line no sequence ratios\n",
"121 122 0.985915\n",
"249 250 0.985915\n",
"251 252 0.985915\n",
"254 255 0.985915\n",
"426 427 0.985915\n",
"589 590 0.985915\n",
"601 602 0.985915\n",
"611 612 0.989474"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_sequence_missings"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "5b7fab09",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Bar chart of the similarity ratio of every line pair whose ratio is below 1.\n",
"# The Axes returned by pandas is used directly instead of the implicit pyplot state.\n",
"ax = df_sequence_missings['sequence ratios'].plot(kind='bar')\n",
"\n",
"ax.set_ylabel(\"Sequence Ratio < 1\")\n",
"ax.set_xlabel(\"Index / Sequence number where genomes are missing\")\n",
"\n",
"# Custom legend for the bar chart; the trailing ';' keeps the Legend repr out of the cell output.\n",
"ax.legend(['Index of Missing Sequences'], prop={'size': 20});"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "d12ca912",
"metadata": {},
"outputs": [],
"source": [
"sequence1_All=''\n",
"sequence2_All=''"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "e5789852",
"metadata": {},
"outputs": [],
"source": [
"# Read the first input file into one continuous string with the line breaks removed.\n",
"# The joined result is assigned (not appended to the previous value), so re-running\n",
"# this cell does not duplicate the sequence.\n",
"with open(input_file1) as file1:\n",
"    sequence1_All = ''.join(line.strip('\\n') for line in file1)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "b6064bd9",
"metadata": {},
"outputs": [],
"source": [
"# Read the second input file into one continuous string with the line breaks removed.\n",
"# The joined result is assigned (not appended to the previous value), so re-running\n",
"# this cell does not duplicate the sequence.\n",
"with open(input_file2) as file2:\n",
"    sequence2_All = ''.join(line.strip('\\n') for line in file2)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "f8b08d7a",
"metadata": {},
"outputs": [],
"source": [
"def basic_Analysis(DNAseq):\n",
"    \"\"\"Plot and print the base composition of a DNA sequence.\n",
"\n",
"    Counts the upper-case characters 'A', 'G', 'T' and 'C' in ``DNAseq``, draws a bar\n",
"    chart of the four counts, then prints a table of counts with percentages followed\n",
"    by the GC content.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    DNAseq : str\n",
"        Sequence to analyse. If the four counts do not add up to ``len(DNAseq)``\n",
"        (any other character is present) 'Something is not right' is printed and\n",
"        the analysis continues.\n",
"\n",
"    Returns\n",
"    -------\n",
"    None\n",
"        Results are only plotted and printed. An empty sequence prints a short\n",
"        notice and returns without plotting.\n",
"    \"\"\"\n",
"    total_base = len(DNAseq)\n",
"    if total_base == 0:\n",
"        # Every percentage below divides by total_base, so there is nothing to report.\n",
"        print('Empty sequence: nothing to analyse')\n",
"        return\n",
"\n",
"    num_Adenine = DNAseq.count('A')\n",
"    num_Guanine = DNAseq.count('G')\n",
"    num_Thymine = DNAseq.count('T')\n",
"    num_Cytosine = DNAseq.count('C')\n",
"\n",
"    if total_base != num_Adenine + num_Guanine + num_Thymine + num_Cytosine:\n",
"        print('Something is not right')\n",
"\n",
"    A_percent = num_Adenine / total_base\n",
"    G_percent = num_Guanine / total_base\n",
"    T_percent = num_Thymine / total_base\n",
"    C_percent = num_Cytosine / total_base\n",
"\n",
"    # Visualization: one bar per base, in the same order as the table below.\n",
"    x = np.arange(4)\n",
"    bases = ['Adenine', 'Guanine', 'Thymine', 'Cytosine']\n",
"    values = [num_Adenine, num_Guanine, num_Thymine, num_Cytosine]\n",
"    plt.bar(x, values)\n",
"    plt.xticks(x, bases)\n",
"    plt.show()\n",
"\n",
"    table = [['total base', total_base, 'Percentage', '100%'],\n",
"             ['Adenine:', num_Adenine, 'Percentage:', str(round(A_percent*100, 2)) + '%'],\n",
"             ['Guanine:', num_Guanine, 'Percentage:', str(round(G_percent*100, 2)) + '%'],\n",
"             ['Thymine:', num_Thymine, 'Percentage:', str(round(T_percent*100, 2)) + '%'],\n",
"             ['Cytosine:', num_Cytosine, 'Percentage:', str(round(C_percent*100, 2)) + '%']]\n",
"    print(tabulate(table))\n",
"    print('GC content:', round((((num_Guanine + num_Cytosine) / total_base)*100), 2), '%')"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "61655d2a",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYMAAAD4CAYAAAAO9oqkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAAAVUUlEQVR4nO3df5TddX3n8eerZFERISBTikmOyZGs3UAphVnIrv1hxRMCeky6pQrrlkCz5myLu3Wtq2B3mxbLEXd7ykqrdFNJCR7Kj0UsOYLGHMDVtYBMAIEQKSM/JDkgIwnolhUMvveP+5nlMsyQmbmTmYQ8H+fcc7/f9/fz+X4/98vNvO73x72kqpAk7dt+ZqYHIEmaeYaBJMkwkCQZBpIkDANJEjBrpgcwWYcddljNnz9/pochSXuVTZs2/aCq+kbW99owmD9/PgMDAzM9DEnaqyR5dLS6p4kkSbsOgyRrkzyZ5L5Rlv1BkkpyWJtPkouTDCa5J8lxXW1XJHmwPVZ01Y9Pcm/rc3GSTNWLkySNz3iODC4Dlo4sJpkHLAG+11U+BVjYHquAS1rbQ4HVwInACcDqJIe0PpcAH+jq97JtSZJ2r12GQVV9Hdg+yqKLgI8C3b9nsQy4vDpuA2YnOQI4GdhYVduragewEVjalh1UVbdV53cxLgeW9/SKJEkTNqlrBkmWAduq6tsjFs0BHuua39pqr1TfOkp9rO2uSjKQZGBoaGgyQ5ckjWLCYZDkAODjwB9N/XBeWVWtqar+qurv63vZnVGSpEmazJHBW4AFwLeTPALMBe5M8nPANmBeV9u5rfZK9bmj1CVJ02jCYVBV91bVz1bV/KqaT+fUznFV9QSwHjiz3VW0GHimqh4HNgBLkhzSLhwvATa0ZT9MsrjdRXQmcP0UvTZJ0jiN59bSK4Fbgbcm2Zpk5Ss0vxF4CBgE/hr4PYCq2g58ArijPc5vNVqbz7U+3wW+PLmXIkmarOyt/3Ob/v7+8hvI2hvNP/eGmR7CjHrkwnfN9BD2aUk2VVX/yLrfQJYkGQaSJMNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJKAWTM9gJngTwj7E8KSXsojA0mSYSBJMgwkSRgGkiQMA0kShoEkiXGEQZK1SZ5Mcl9X7b8l+U6Se5J8McnsrmXnJRlM8kCSk7vqS1ttMMm5XfUFSW5v9auT7D+Fr0+SNA7jOTK4DFg6orYROLqqjgH+ATgPIMki4HTgqNbns0n2S7If8BngFGARcEZrC/Ap4KKqOhLYAazs6RVJkiZsl2FQVV8Hto+ofbWqdrbZ24C5bXoZcFVVPVdVDwODwAntMVhVD1XV88BVwLIkAd4BXNv6rwOW9/aSJEkTNRXXDH4H+HKbngM81rVsa6uNVX8j8HRXsAzXJUnTqKcwSPKHwE7giqkZzi63tyrJQJKBoaGh6dikJO0TJh0GSc4C3g28v6qqlbcB87qazW21sepPAbOTzBpRH1VVramq/qrq7+vrm+zQJUkjTCoMkiwFPgq8p6qe7Vq0Hjg9yWuSLAAWAt8C7gAWtjuH9qdzkXl9C5FbgNNa/xXA9ZN7KZKkyRrPraVXArcCb02yNclK4C+BNwAbk9yd5K8AqmozcA1wP/AV4JyqeqFdE/ggsAHYAlzT2gJ8DPhwkkE61xAundJXKEnapV3+hHVVnTFKecw/2FV1AXDBKPUbgRtHqT9E524jSdIM8RvIkiTDQJJkGEiSMAwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRLjCIMka5M8meS+rtqhSTYmebA9H9LqSXJxksEk9yQ5rqvPitb+wSQruurHJ7m39bk4Sab6RUqSXtl4jgwuA5aOqJ0L3FRVC4Gb2jzAKcDC9lgFXAKd8ABWAycCJwCrhw
OktflAV7+R25Ik7Wa7DIOq+jqwfUR5GbCuTa8DlnfVL6+O24DZSY4ATgY2VtX2qtoBbASWtmUHVdVtVVXA5V3rkiRNk8leMzi8qh5v008Ah7fpOcBjXe22ttor1beOUh9VklVJBpIMDA0NTXLokqSRer6A3D7R1xSMZTzbWlNV/VXV39fXNx2blKR9wmTD4PvtFA/t+clW3wbM62o3t9VeqT53lLokaRpNNgzWA8N3BK0Aru+qn9nuKloMPNNOJ20AliQ5pF04XgJsaMt+mGRxu4vozK51SZKmyaxdNUhyJfB24LAkW+ncFXQhcE2SlcCjwHtb8xuBU4FB4FngbICq2p7kE8Adrd35VTV8Ufr36Nyx9Drgy+0hSZpGuwyDqjpjjEUnjdK2gHPGWM9aYO0o9QHg6F2NQ5K0+/gNZEmSYSBJMgwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIYx//2Uhpp/rk3zPQQZtQjF75rpocgTTmPDCRJhoEkqccwSPIfk2xOcl+SK5O8NsmCJLcnGUxydZL9W9vXtPnBtnx+13rOa/UHkpzc42uSJE3QpMMgyRzgPwD9VXU0sB9wOvAp4KKqOhLYAaxsXVYCO1r9otaOJItav6OApcBnk+w32XFJkiau19NEs4DXJZkFHAA8DrwDuLYtXwcsb9PL2jxt+UlJ0upXVdVzVfUwMAic0OO4JEkTMOkwqKptwJ8B36MTAs8Am4Cnq2pna7YVmNOm5wCPtb47W/s3dtdH6fMSSVYlGUgyMDQ0NNmhS5JG6OU00SF0PtUvAN4EvJ7OaZ7dpqrWVFV/VfX39fXtzk1J0j6ll9NE7wQerqqhqvoJcB3wNmB2O20EMBfY1qa3AfMA2vKDgae666P0kSRNg17C4HvA4iQHtHP/JwH3A7cAp7U2K4Dr2/T6Nk9bfnNVVauf3u42WgAsBL7Vw7gkSRM06W8gV9XtSa4F7gR2AncBa4AbgKuS/GmrXdq6XAp8PskgsJ3OHURU1eYk19AJkp3AOVX1wmTHJUmauJ5+jqKqVgOrR5QfYpS7garqx8BvjbGeC4ALehmLpH2DP4eye34OxW8gS5IMA0mSYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkugxDJLMTnJtku8k2ZLkXyQ5NMnGJA+250Na2yS5OMlgknuSHNe1nhWt/YNJVvT6oiRJE9PrkcGnga9U1c8DvwhsAc4FbqqqhcBNbR7gFGBhe6wCLgFIciiwGjgROAFYPRwgkqTpMekwSHIw8KvApQBV9XxVPQ0sA9a1ZuuA5W16GXB5ddwGzE5yBHAysLGqtlfVDmAjsHSy45IkTVwvRwYLgCHgb5LcleRzSV4PHF5Vj7c2TwCHt+k5wGNd/be22lj1l0myKslAkoGhoaEehi5J6tZLGMwCjgMuqapfAv6RF08JAVBVBVQP23iJqlpTVf1V1d/X1zdVq5WkfV4vYbAV2FpVt7f5a+mEw/fb6R/a85Nt+TZgXlf/ua02Vl2SNE0mHQZV9QTwWJK3ttJJwP3AemD4jqAVwPVtej1wZruraDHwTDudtAFYkuSQduF4SatJkqbJrB77/3vgiiT7Aw8BZ9MJmGuSrAQeBd7b2t4InAoMAs+2tlTV9iSfAO5o7c6vqu09jkuSNAE9hUFV3Q30j7LopFHaFnDOGOtZC6ztZSySpMnzG8iSJMNAkmQYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkpiAMkuyX5K4kX2rzC5LcnmQwydVJ9m/117T5wbZ8ftc6zmv1B5Kc3OuYJEkTMxVHBr8PbOma/xRwUVUdCewAVr
b6SmBHq1/U2pFkEXA6cBSwFPhskv2mYFySpHHqKQySzAXeBXyuzQd4B3Bta7IOWN6ml7V52vKTWvtlwFVV9VxVPQwMAif0Mi5J0sT0emTw34GPAj9t828Enq6qnW1+KzCnTc8BHgNoy59p7f9/fZQ+L5FkVZKBJANDQ0M9Dl2SNGzSYZDk3cCTVbVpCsfziqpqTVX1V1V/X1/fdG1Wkl71ZvXQ923Ae5KcCrwWOAj4NDA7yaz26X8usK213wbMA7YmmQUcDDzVVR/W3UeSNA0mfWRQVedV1dyqmk/nAvDNVfV+4BbgtNZsBXB9m17f5mnLb66qavXT291GC4CFwLcmOy5J0sT1cmQwlo8BVyX5U+Au4NJWvxT4fJJBYDudAKGqNie5Brgf2AmcU1Uv7IZxSZLGMCVhUFVfA77Wph9ilLuBqurHwG+N0f8C4IKpGIskaeL8BrIkyTCQJBkGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSSJHsIgybwktyS5P8nmJL/f6ocm2ZjkwfZ8SKsnycVJBpPck+S4rnWtaO0fTLKi95clSZqIXo4MdgJ/UFWLgMXAOUkWAecCN1XVQuCmNg9wCrCwPVYBl0AnPIDVwInACcDq4QCRJE2PSYdBVT1eVXe26R8BW4A5wDJgXWu2DljeppcBl1fHbcDsJEcAJwMbq2p7Ve0ANgJLJzsuSdLETck1gyTzgV8CbgcOr6rH26IngMPb9Bzgsa5uW1ttrLokaZr0HAZJDgS+AHyoqn7YvayqCqhet9G1rVVJBpIMDA0NTdVqJWmf11MYJPkndILgiqq6rpW/307/0J6fbPVtwLyu7nNbbaz6y1TVmqrqr6r+vr6+XoYuSerSy91EAS4FtlTVn3ctWg8M3xG0Ari+q35mu6toMfBMO520AViS5JB24XhJq0mSpsmsHvq+Dfht4N4kd7fax4ELgWuSrAQeBd7blt0InAoMAs8CZwNU1fYknwDuaO3Or6rtPYxLkjRBkw6DqvrfQMZYfNIo7Qs4Z4x1rQXWTnYskqTe+A1kSZJhIEkyDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJIk9KAySLE3yQJLBJOfO9HgkaV+yR4RBkv2AzwCnAIuAM5IsmtlRSdK+Y48IA+AEYLCqHqqq54GrgGUzPCZJ2mekqmZ6DCQ5DVhaVf+2zf82cGJVfXBEu1XAqjb7VuCBaR3o1DkM+MFMD2Iv5v7rjfuvN3v7/ntzVfWNLM6aiZFMVlWtAdbM9Dh6lWSgqvpnehx7K/dfb9x/vXm17r895TTRNmBe1/zcVpMkTYM9JQzuABYmWZBkf+B0YP0Mj0mS9hl7xGmiqtqZ5IPABmA/YG1VbZ7hYe1Oe/2prhnm/uuN+683r8r9t0dcQJYkzaw95TSRJGkGGQaSJMNgIpIsT1JJfn6M5V9LMqlbzpK859X8MxxJDk/yt0keSrIpya1JfmOKt/Gq2odJ3pjk7vZ4Ism2Nv10kvt3w/belOTaqV7vTEvyc0muSvLd9t67Mck/HaPtsUlOneR29ur95zWDCUhyNfAm4OaqWj3K8q8BH6mqgeke254sSYC/B9ZV1V+12puB91TVX8zo4PYSSf4Y+D9V9WdJ5gNfqqqjZ3ZUe74x3nu/CBxUVd8Ypf1ZQP/IL7zuCzwyGKckBwK/DKykc+srSV7XPnFsSfJF4HVd7Ze0T793JvmfrT9JHknyJ61+7/BRRpKzkvxlm74sycVJ/r59kj6ta73/KckdSe5J8ifTtwd68g7g+eF/jABV9WhV/UX36wZI8qUkb2/TlyQZSLK5+7Xuo/twpP2S/HXbN19t78W3JLlzuEGShcPzbZ99sh1ZDCQ5LsmG9m
n537U285Pc16bPSnJdkq8keTDJf+1a76jv7T3UrwM/GfHe+zbwgSTLh2tJrkiyDDgfeF/bT+9LcmiSv2vvlduSHNPa/1rXUdtdSd6wt+8/w2D8lgFfqap/AJ5Kcjzwu8CzVfXPgNXA8QBJDgP+M/DOqjoOGAA+3LWuH7T6JcBHxtjeEXTC593AhW29S4CFdH7L6Vjg+CS/OpUvcjc5Crhzl61e7g/bNz2PAX5t+B9is6/tw5EWAp+pqqOAp4HfrKrvAs8kOba1ORv4m64+36uqY4FvAJcBpwGLgbEC8VjgfcAv0PkDOW8c7+09zdHAplHqlwJnASQ5GPiXwA3AHwFXV9WxVXU1nX1zV1UdA3wcuLz1/whwTtufvwL831G2cSx70f7bI75nsJc4A/h0m76qzR8JXAxQVfckuactX0zn11e/2TlKZX/g1q51XdeeNwH/aozt/V1V/RS4P8nhrbakPe5q8wfS+aPw9cm/rOmX5DN0/kg/T+fXasfy3nR+j2oWnT/si4DhfbxP70Pg4aq6u01vAua36c8BZyf5MJ0/RCd09Rn+Iue9wIFV9SPgR0meSzJ7lG3cVFXPAKRzjeLNwGxe+b29V6iq/5Xks0n6gN8EvtC+7zSy6S+35VTVzelcxzkI+Cbw50muAK6rqq2j9N2r9p9hMA5JDqVzquMXkhSdL8YVL/5BeVkXYGNVnTHG8ufa8wuM/d/gua7pdD1/sqr+x3jHvofYTPsHBVBV57RPSAPATl56hPpagCQL6Hz6+udVtSPJZcPLmn1tH47U/dpe4MVTlF+gc5R6M7Cpqp4apc9PR/T/KaPvw5HbmMWu39t7ms10joBGcznwb+ic9j17IiutqguT3ACcSucP+8nAj0c026v2n6eJxuc04PNV9eaqml9V84CH6Xwi+9cASY6mczoD4DbgbUmObMtenzHuXpigDcDv5MXrD3OS/OwUrHd3uxl4bZLf7aod0J4fAY5N8jNJ5vHiJ9mDgH+kc9rjcDr/r4upsLfuw3Gpqh/TeY2X8NJTRFNld723d5ebgde0I0wAkhyT5FfonCr7EEBVDd+d9SPgDV39vwG8v/V7O53Tkz9M8paqureqPkXn53RGvcNwFHvs/jMMxucM4Isjal8AFgAHJtlC58LTJoCqGqJzPvLKduroVsb/ZhlTVX0V+Fvg1iT3Atfy0jfuHqk6t6wtp3Pe/+Ek3wLWAR+jc7j9MHA/nVNud7Y+36Zz5PUdOq/5m1M0lr1yH07QFXQ+7X91qle8u97bu0t77/0G8M52sXwz8Engiar6PrCFl4bmLcCi4QvIwB/Tua50D53rTitauw8lua/VfwJ8eZzj2WP3n7eWSq8yST4CHFxV/2Wmx7InS3IAnesnxw2f29+Xec1AehVJ5xbnt9C5xqUxJHknnTuKLjIIOjwykCR5zUCSZBhIkjAMJEkYBpIkDANJEvD/AG3vovyAnWp7AAAAAElFTkSuQmCC\n",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"---------- ----- ----------- ------\n",
"total base 42483 Percentage 100%\n",
"Adenine: 12688 Percentage: 29.87%\n",
"Guanine: 8393 Percentage: 19.76%\n",
"Thynime: 13709 Percentage: 32.27%\n",
"Cytosine: 7693 Percentage: 18.11%\n",
"---------- ----- ----------- ------\n",
"GC content: 37.86 %\n"
]
}
],
"source": [
"basic_Analysis(sequence1_All)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "99fef932",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYMAAAD4CAYAAAAO9oqkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAAAVUUlEQVR4nO3df5TddX3n8eerZFERISBTikmOyZGs3UAphVnIrv1hxRMCeky6pQrrlkCz5myLu3Wtq2B3mxbLEXd7ykqrdFNJCR7Kj0UsOYLGHMDVtYBMAIEQKSM/JDkgIwnolhUMvveP+5nlMsyQmbmTmYQ8H+fcc7/f9/fz+X4/98vNvO73x72kqpAk7dt+ZqYHIEmaeYaBJMkwkCQZBpIkDANJEjBrpgcwWYcddljNnz9/pochSXuVTZs2/aCq+kbW99owmD9/PgMDAzM9DEnaqyR5dLS6p4kkSbsOgyRrkzyZ5L5Rlv1BkkpyWJtPkouTDCa5J8lxXW1XJHmwPVZ01Y9Pcm/rc3GSTNWLkySNz3iODC4Dlo4sJpkHLAG+11U+BVjYHquAS1rbQ4HVwInACcDqJIe0PpcAH+jq97JtSZJ2r12GQVV9Hdg+yqKLgI8C3b9nsQy4vDpuA2YnOQI4GdhYVduragewEVjalh1UVbdV53cxLgeW9/SKJEkTNqlrBkmWAduq6tsjFs0BHuua39pqr1TfOkp9rO2uSjKQZGBoaGgyQ5ckjWLCYZDkAODjwB9N/XBeWVWtqar+qurv63vZnVGSpEmazJHBW4AFwLeTPALMBe5M8nPANmBeV9u5rfZK9bmj1CVJ02jCYVBV91bVz1bV/KqaT+fUznFV9QSwHjiz3VW0GHimqh4HNgBLkhzSLhwvATa0ZT9MsrjdRXQmcP0UvTZJ0jiN59bSK4Fbgbcm2Zpk5Ss0vxF4CBgE/hr4PYCq2g58ArijPc5vNVqbz7U+3wW+PLmXIkmarOyt/3Ob/v7+8hvI2hvNP/eGmR7CjHrkwnfN9BD2aUk2VVX/yLrfQJYkGQaSJMNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJKAWTM9gJngTwj7E8KSXsojA0mSYSBJMgwkSRgGkiQMA0kShoEkiXGEQZK1SZ5Mcl9X7b8l+U6Se5J8McnsrmXnJRlM8kCSk7vqS1ttMMm5XfUFSW5v9auT7D+Fr0+SNA7jOTK4DFg6orYROLqqjgH+ATgPIMki4HTgqNbns0n2S7If8BngFGARcEZrC/Ap4KKqOhLYAazs6RVJkiZsl2FQVV8Hto+ofbWqdrbZ24C5bXoZcFVVPVdVDwODwAntMVhVD1XV88BVwLIkAd4BXNv6rwOW9/aSJEkTNRXXDH4H+HKbngM81rVsa6uNVX8j8HRXsAzXJUnTqKcwSPKHwE7giqkZzi63tyrJQJKBoaGh6dikJO0TJh0GSc4C3g28v6qqlbcB87qazW21sepPAbOTzBpRH1VVramq/qrq7+vrm+zQJUkjTCoMkiwFPgq8p6qe7Vq0Hjg9yWuSLAAWAt8C7gAWtjuH9qdzkXl9C5FbgNNa/xXA9ZN7KZKkyRrPraVXArcCb02yNclK4C+BNwAbk9yd5K8AqmozcA1wP/AV4JyqeqFdE/ggsAHYAlzT2gJ8DPhwkkE61xAundJXKEnapV3+hHVVnTFKecw/2FV1AXDBKPUbgRtHqT9E524jSdIM8RvIkiTDQJJkGEiSMAwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRLjCIMka5M8meS+rtqhSTYmebA9H9LqSXJxksEk9yQ5rqvPitb+wSQruurHJ7m39bk4Sab6RUqSXtl4jgwuA5aOqJ0L3FRVC4Gb2jzAKcDC9lgFXAKd8ABWAycCJwCrhw
OktflAV7+R25Ik7Wa7DIOq+jqwfUR5GbCuTa8DlnfVL6+O24DZSY4ATgY2VtX2qtoBbASWtmUHVdVtVVXA5V3rkiRNk8leMzi8qh5v008Ah7fpOcBjXe22ttor1beOUh9VklVJBpIMDA0NTXLokqSRer6A3D7R1xSMZTzbWlNV/VXV39fXNx2blKR9wmTD4PvtFA/t+clW3wbM62o3t9VeqT53lLokaRpNNgzWA8N3BK0Aru+qn9nuKloMPNNOJ20AliQ5pF04XgJsaMt+mGRxu4vozK51SZKmyaxdNUhyJfB24LAkW+ncFXQhcE2SlcCjwHtb8xuBU4FB4FngbICq2p7kE8Adrd35VTV8Ufr36Nyx9Drgy+0hSZpGuwyDqjpjjEUnjdK2gHPGWM9aYO0o9QHg6F2NQ5K0+/gNZEmSYSBJMgwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIYx//2Uhpp/rk3zPQQZtQjF75rpocgTTmPDCRJhoEkqccwSPIfk2xOcl+SK5O8NsmCJLcnGUxydZL9W9vXtPnBtnx+13rOa/UHkpzc42uSJE3QpMMgyRzgPwD9VXU0sB9wOvAp4KKqOhLYAaxsXVYCO1r9otaOJItav6OApcBnk+w32XFJkiau19NEs4DXJZkFHAA8DrwDuLYtXwcsb9PL2jxt+UlJ0upXVdVzVfUwMAic0OO4JEkTMOkwqKptwJ8B36MTAs8Am4Cnq2pna7YVmNOm5wCPtb47W/s3dtdH6fMSSVYlGUgyMDQ0NNmhS5JG6OU00SF0PtUvAN4EvJ7OaZ7dpqrWVFV/VfX39fXtzk1J0j6ll9NE7wQerqqhqvoJcB3wNmB2O20EMBfY1qa3AfMA2vKDgae666P0kSRNg17C4HvA4iQHtHP/JwH3A7cAp7U2K4Dr2/T6Nk9bfnNVVauf3u42WgAsBL7Vw7gkSRM06W8gV9XtSa4F7gR2AncBa4AbgKuS/GmrXdq6XAp8PskgsJ3OHURU1eYk19AJkp3AOVX1wmTHJUmauJ5+jqKqVgOrR5QfYpS7garqx8BvjbGeC4ALehmLpH2DP4eye34OxW8gS5IMA0mSYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkugxDJLMTnJtku8k2ZLkXyQ5NMnGJA+250Na2yS5OMlgknuSHNe1nhWt/YNJVvT6oiRJE9PrkcGnga9U1c8DvwhsAc4FbqqqhcBNbR7gFGBhe6wCLgFIciiwGjgROAFYPRwgkqTpMekwSHIw8KvApQBV9XxVPQ0sA9a1ZuuA5W16GXB5ddwGzE5yBHAysLGqtlfVDmAjsHSy45IkTVwvRwYLgCHgb5LcleRzSV4PHF5Vj7c2TwCHt+k5wGNd/be22lj1l0myKslAkoGhoaEehi5J6tZLGMwCjgMuqapfAv6RF08JAVBVBVQP23iJqlpTVf1V1d/X1zdVq5WkfV4vYbAV2FpVt7f5a+mEw/fb6R/a85Nt+TZgXlf/ua02Vl2SNE0mHQZV9QTwWJK3ttJJwP3AemD4jqAVwPVtej1wZruraDHwTDudtAFYkuSQduF4SatJkqbJrB77/3vgiiT7Aw8BZ9MJmGuSrAQeBd7b2t4InAoMAs+2tlTV9iSfAO5o7c6vqu09jkuSNAE9hUFV3Q30j7LopFHaFnDOGOtZC6ztZSySpMnzG8iSJMNAkmQYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkpiAMkuyX5K4kX2rzC5LcnmQwydVJ9m/117T5wbZ8ftc6zmv1B5Kc3OuYJEkTMxVHBr8PbOma/xRwUVUdCewAVr
b6SmBHq1/U2pFkEXA6cBSwFPhskv2mYFySpHHqKQySzAXeBXyuzQd4B3Bta7IOWN6ml7V52vKTWvtlwFVV9VxVPQwMAif0Mi5J0sT0emTw34GPAj9t828Enq6qnW1+KzCnTc8BHgNoy59p7f9/fZQ+L5FkVZKBJANDQ0M9Dl2SNGzSYZDk3cCTVbVpCsfziqpqTVX1V1V/X1/fdG1Wkl71ZvXQ923Ae5KcCrwWOAj4NDA7yaz26X8usK213wbMA7YmmQUcDDzVVR/W3UeSNA0mfWRQVedV1dyqmk/nAvDNVfV+4BbgtNZsBXB9m17f5mnLb66qavXT291GC4CFwLcmOy5J0sT1cmQwlo8BVyX5U+Au4NJWvxT4fJJBYDudAKGqNie5Brgf2AmcU1Uv7IZxSZLGMCVhUFVfA77Wph9ilLuBqurHwG+N0f8C4IKpGIskaeL8BrIkyTCQJBkGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSSJHsIgybwktyS5P8nmJL/f6ocm2ZjkwfZ8SKsnycVJBpPck+S4rnWtaO0fTLKi95clSZqIXo4MdgJ/UFWLgMXAOUkWAecCN1XVQuCmNg9wCrCwPVYBl0AnPIDVwInACcDq4QCRJE2PSYdBVT1eVXe26R8BW4A5wDJgXWu2DljeppcBl1fHbcDsJEcAJwMbq2p7Ve0ANgJLJzsuSdLETck1gyTzgV8CbgcOr6rH26IngMPb9Bzgsa5uW1ttrLokaZr0HAZJDgS+AHyoqn7YvayqCqhet9G1rVVJBpIMDA0NTdVqJWmf11MYJPkndILgiqq6rpW/307/0J6fbPVtwLyu7nNbbaz6y1TVmqrqr6r+vr6+XoYuSerSy91EAS4FtlTVn3ctWg8M3xG0Ari+q35mu6toMfBMO520AViS5JB24XhJq0mSpsmsHvq+Dfht4N4kd7fax4ELgWuSrAQeBd7blt0InAoMAs8CZwNU1fYknwDuaO3Or6rtPYxLkjRBkw6DqvrfQMZYfNIo7Qs4Z4x1rQXWTnYskqTe+A1kSZJhIEkyDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJIk9KAySLE3yQJLBJOfO9HgkaV+yR4RBkv2AzwCnAIuAM5IsmtlRSdK+Y48IA+AEYLCqHqqq54GrgGUzPCZJ2mekqmZ6DCQ5DVhaVf+2zf82cGJVfXBEu1XAqjb7VuCBaR3o1DkM+MFMD2Iv5v7rjfuvN3v7/ntzVfWNLM6aiZFMVlWtAdbM9Dh6lWSgqvpnehx7K/dfb9x/vXm17r895TTRNmBe1/zcVpMkTYM9JQzuABYmWZBkf+B0YP0Mj0mS9hl7xGmiqtqZ5IPABmA/YG1VbZ7hYe1Oe/2prhnm/uuN+683r8r9t0dcQJYkzaw95TSRJGkGGQaSJMNgIpIsT1JJfn6M5V9LMqlbzpK859X8MxxJDk/yt0keSrIpya1JfmOKt/Gq2odJ3pjk7vZ4Ism2Nv10kvt3w/belOTaqV7vTEvyc0muSvLd9t67Mck/HaPtsUlOneR29ur95zWDCUhyNfAm4OaqWj3K8q8BH6mqgeke254sSYC/B9ZV1V+12puB91TVX8zo4PYSSf4Y+D9V9WdJ5gNfqqqjZ3ZUe74x3nu/CBxUVd8Ypf1ZQP/IL7zuCzwyGKckBwK/DKykc+srSV7XPnFsSfJF4HVd7Ze0T793JvmfrT9JHknyJ61+7/BRRpKzkvxlm74sycVJ/r59kj6ta73/KckdSe5J8ifTtwd68g7g+eF/jABV9WhV/UX36wZI8qUkb2/TlyQZSLK5+7Xuo/twpP2S/HXbN19t78W3JLlzuEGShcPzbZ99sh1ZDCQ5LsmG9m
n537U285Pc16bPSnJdkq8keTDJf+1a76jv7T3UrwM/GfHe+zbwgSTLh2tJrkiyDDgfeF/bT+9LcmiSv2vvlduSHNPa/1rXUdtdSd6wt+8/w2D8lgFfqap/AJ5Kcjzwu8CzVfXPgNXA8QBJDgP+M/DOqjoOGAA+3LWuH7T6JcBHxtjeEXTC593AhW29S4CFdH7L6Vjg+CS/OpUvcjc5Crhzl61e7g/bNz2PAX5t+B9is6/tw5EWAp+pqqOAp4HfrKrvAs8kOba1ORv4m64+36uqY4FvAJcBpwGLgbEC8VjgfcAv0PkDOW8c7+09zdHAplHqlwJnASQ5GPiXwA3AHwFXV9WxVXU1nX1zV1UdA3wcuLz1/whwTtufvwL831G2cSx70f7bI75nsJc4A/h0m76qzR8JXAxQVfckuactX0zn11e/2TlKZX/g1q51XdeeNwH/aozt/V1V/RS4P8nhrbakPe5q8wfS+aPw9cm/rOmX5DN0/kg/T+fXasfy3nR+j2oWnT/si4DhfbxP70Pg4aq6u01vAua36c8BZyf5MJ0/RCd09Rn+Iue9wIFV9SPgR0meSzJ7lG3cVFXPAKRzjeLNwGxe+b29V6iq/5Xks0n6gN8EvtC+7zSy6S+35VTVzelcxzkI+Cbw50muAK6rqq2j9N2r9p9hMA5JDqVzquMXkhSdL8YVL/5BeVkXYGNVnTHG8ufa8wuM/d/gua7pdD1/sqr+x3jHvofYTPsHBVBV57RPSAPATl56hPpagCQL6Hz6+udVtSPJZcPLmn1tH47U/dpe4MVTlF+gc5R6M7Cpqp4apc9PR/T/KaPvw5HbmMWu39t7ms10joBGcznwb+ic9j17IiutqguT3ACcSucP+8nAj0c026v2n6eJxuc04PNV9eaqml9V84CH6Xwi+9cASY6mczoD4DbgbUmObMtenzHuXpigDcDv5MXrD3OS/OwUrHd3uxl4bZLf7aod0J4fAY5N8jNJ5vHiJ9mDgH+kc9rjcDr/r4upsLfuw3Gpqh/TeY2X8NJTRFNld723d5ebgde0I0wAkhyT5FfonCr7EEBVDd+d9SPgDV39vwG8v/V7O53Tkz9M8paqureqPkXn53RGvcNwFHvs/jMMxucM4Isjal8AFgAHJtlC58LTJoCqGqJzPvLKduroVsb/ZhlTVX0V+Fvg1iT3Atfy0jfuHqk6t6wtp3Pe/+Ek3wLWAR+jc7j9MHA/nVNud7Y+36Zz5PUdOq/5m1M0lr1yH07QFXQ+7X91qle8u97bu0t77/0G8M52sXwz8Engiar6PrCFl4bmLcCi4QvIwB/Tua50D53rTitauw8lua/VfwJ8eZzj2WP3n7eWSq8yST4CHFxV/2Wmx7InS3IAnesnxw2f29+Xec1AehVJ5xbnt9C5xqUxJHknnTuKLjIIOjwykCR5zUCSZBhIkjAMJEkYBpIkDANJEvD/AG3vovyAnWp7AAAAAElFTkSuQmCC\n",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"---------- ----- ----------- ------\n",
"total base 42483 Percentage 100%\n",
"Adenine: 12686 Percentage: 29.86%\n",
"Guanine: 8394 Percentage: 19.76%\n",
"Thynime: 13713 Percentage: 32.28%\n",
"Cytosine: 7690 Percentage: 18.1%\n",
"---------- ----- ----------- ------\n",
"GC content: 37.86 %\n"
]
}
],
"source": [
"basic_Analysis(sequence2_All)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "2b164b49",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" line no | \n",
" sequence ratios | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 607 | \n",
" 608 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" 608 | \n",
" 609 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" 609 | \n",
" 610 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" 610 | \n",
" 611 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" 611 | \n",
" 612 | \n",
" 0.989474 | \n",
"
\n",
" \n",
"
\n",
"
612 rows × 2 columns
\n",
"
"
],
"text/plain": [
" line no sequence ratios\n",
"0 1 1.000000\n",
"1 2 1.000000\n",
"2 3 1.000000\n",
"3 4 1.000000\n",
"4 5 1.000000\n",
".. ... ...\n",
"607 608 1.000000\n",
"608 609 1.000000\n",
"609 610 1.000000\n",
"610 611 1.000000\n",
"611 612 0.989474\n",
"\n",
"[612 rows x 2 columns]"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_sequence"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "5e3758a9",
"metadata": {},
"outputs": [],
"source": [
"def Fapriori(itemSetList, minSup, minConf):\n",
"    \"\"\"Mine frequent itemsets and rules from a list of transactions.\n",
"\n",
"    Starts from the single-item candidates and repeatedly joins, prunes and\n",
"    support-filters the surviving itemsets until a level comes back empty.\n",
"\n",
"    Returns a tuple ``(frequent, rules)``: ``frequent`` maps an itemset size to the set\n",
"    of frequent itemsets of that size, and ``rules`` is the list built by\n",
"    ``FassociationRule``, sorted by its third field (the confidence) in ascending order.\n",
"    \"\"\"\n",
"    # Support counts of every itemset examined; filled in by getAboveMinSup at each level.\n",
"    support_counts = defaultdict(int)\n",
"    frequent_by_size = {}\n",
"\n",
"    size = 1\n",
"    level = getAboveMinSup(\n",
"        getItemSetFromList(itemSetList), itemSetList, minSup, support_counts)\n",
"\n",
"    while level:\n",
"        # Keep the frequent itemsets of the current size.\n",
"        frequent_by_size[size] = level\n",
"        size += 1\n",
"        # Self-join the current level into candidates one item larger ...\n",
"        joined = getUnion(level, size)\n",
"        # ... drop candidates that have an infrequent subset ...\n",
"        survivors = pruning(joined, level, size - 1)\n",
"        # ... and count support of what is left against the transactions.\n",
"        level = getAboveMinSup(survivors, itemSetList, minSup, support_counts)\n",
"\n",
"    rules = FassociationRule(frequent_by_size, support_counts, minConf)\n",
"    rules.sort(key=lambda rule: rule[2])\n",
"    return frequent_by_size, rules"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "88d8387a",
"metadata": {},
"outputs": [],
"source": [
"def FaprioriFromFile(fname, minSup, minConf):\n",
"    \"\"\"Run ``Fapriori`` on the transactions stored in a CSV file.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    fname : path of the CSV file handed to ``getFromFile``.\n",
"    minSup, minConf : thresholds passed through to ``Fapriori`` unchanged.\n",
"\n",
"    Returns\n",
"    -------\n",
"    The ``(frequent itemsets, rules)`` tuple returned by ``Fapriori``.\n",
"    \"\"\"\n",
"    # getFromFile also returns the single-item candidates, but Fapriori derives the same\n",
"    # set from the transactions itself, so only the transaction list is passed on.\n",
"    # Delegating keeps this entry point on the rule builder defined in this notebook\n",
"    # (FassociationRule) instead of repeating the mining loop here.\n",
"    _, itemSetList = getFromFile(fname)\n",
"    return Fapriori(itemSetList, minSup, minConf)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "c8bd8453",
"metadata": {},
"outputs": [],
"source": [
"def powerset(s):\n",
"    \"\"\"Iterate over the non-empty proper subsets of ``s`` as tuples.\n",
"\n",
"    Subsets are produced by increasing size, from 1 up to ``len(s) - 1``; the empty\n",
"    subset and ``s`` itself are not included.\n",
"    \"\"\"\n",
"    subset_sizes = range(1, len(s))\n",
"    per_size = map(lambda size: combinations(s, size), subset_sizes)\n",
"    return chain.from_iterable(per_size)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "04a9b357",
"metadata": {},
"outputs": [],
"source": [
"def getFromFile(fname):\n",
"    \"\"\"Read transactions from a CSV file, one transaction per row.\n",
"\n",
"    Empty fields are dropped and every row is turned into a set of its items.\n",
"\n",
"    Returns a tuple ``(itemSet, itemSets)``: ``itemSet`` is the set of single-item\n",
"    frozensets seen anywhere in the file and ``itemSets`` is the list of row sets\n",
"    in file order.\n",
"    \"\"\"\n",
"    # Local import: the notebook's import cell does not import csv.reader explicitly.\n",
"    from csv import reader\n",
"\n",
"    itemSets = []\n",
"    itemSet = set()\n",
"\n",
"    # newline='' is the mode the csv module documents for files handed to reader().\n",
"    with open(fname, 'r', newline='') as file:\n",
"        for line in reader(file):\n",
"            record = set(filter(None, line))\n",
"            itemSet.update(frozenset([item]) for item in record)\n",
"            itemSets.append(record)\n",
"    return itemSet, itemSets"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "e61fb8d0",
"metadata": {},
"outputs": [],
"source": [
"def getAboveMinSup(itemSet, itemSetList, minSup, globalItemSetWithSup):\n",
"    \"\"\"Return the candidates whose support reaches ``minSup``.\n",
"\n",
"    For every candidate in ``itemSet`` the number of transactions in ``itemSetList``\n",
"    that contain it is counted. Counts above zero are added to\n",
"    ``globalItemSetWithSup`` (mutated in place). A candidate is kept when its count\n",
"    divided by ``len(itemSetList)`` is at least ``minSup``.\n",
"    \"\"\"\n",
"    local_counts = defaultdict(int)\n",
"\n",
"    for candidate in itemSet:\n",
"        hits = sum(1 for transaction in itemSetList if candidate.issubset(transaction))\n",
"        if hits:\n",
"            # Candidates that never occur get no entry in either counter.\n",
"            globalItemSetWithSup[candidate] += hits\n",
"            local_counts[candidate] += hits\n",
"\n",
"    return {\n",
"        candidate\n",
"        for candidate, hits in local_counts.items()\n",
"        if float(hits / len(itemSetList)) >= minSup\n",
"    }"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "4d46b1b7",
"metadata": {},
"outputs": [],
"source": [
"def getUnion(itemSet, length):\n",
"    \"\"\"Self-join ``itemSet``: return every pairwise union that has exactly ``length`` items.\"\"\"\n",
"    joined = set()\n",
"    for left in itemSet:\n",
"        for right in itemSet:\n",
"            merged = left.union(right)\n",
"            if len(merged) == length:\n",
"                joined.add(merged)\n",
"    return joined"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "c4288f6d",
"metadata": {},
"outputs": [],
"source": [
"def pruning(candidateSet, prevFreqSet, length):\n",
"    \"\"\"Keep only the candidates whose every ``length``-item subset is in ``prevFreqSet``.\n",
"\n",
"    ``candidateSet`` itself is left untouched; a new set is returned.\n",
"    \"\"\"\n",
"    return {\n",
"        candidate\n",
"        for candidate in candidateSet\n",
"        # One subset missing from the previous frequent level disqualifies the candidate.\n",
"        if all(frozenset(part) in prevFreqSet for part in combinations(candidate, length))\n",
"    }"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "4152e510",
"metadata": {},
"outputs": [],
"source": [
"def FassociationRule(freqItemSet, itemSetWithSup, minConf):\n",
"    \"\"\"Build rules ``[antecedent, consequent, confidence]`` from the frequent itemsets.\n",
"\n",
"    For each frequent itemset and each subset yielded by ``powerset``, the confidence is\n",
"    the itemset's support count divided by the subset's support count. A rule is kept\n",
"    only when that confidence is strictly greater than ``minConf``.\n",
"    \"\"\"\n",
"    found = []\n",
"    for frequent_sets in freqItemSet.values():\n",
"        for whole in frequent_sets:\n",
"            for antecedent in powerset(whole):\n",
"                ratio = itemSetWithSup[whole] / itemSetWithSup[frozenset(antecedent)]\n",
"                confidence = float(ratio)\n",
"                if confidence > minConf:\n",
"                    consequent = set(whole.difference(antecedent))\n",
"                    found.append([set(antecedent), consequent, confidence])\n",
"    return found"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "1b2c0b08",
"metadata": {},
"outputs": [],
"source": [
"def getItemSetFromList(itemSetList):\n",
"    \"\"\"Return the set of single-item frozensets for every item found in any transaction.\"\"\"\n",
"    return {\n",
"        frozenset([item])\n",
"        for transaction in itemSetList\n",
"        for item in transaction\n",
"    }"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "e3e50e57",
"metadata": {},
"outputs": [],
"source": [
"g_sequence1=list()\n",
"g_sequence2=list()"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "c3b0cf11",
"metadata": {},
"outputs": [],
"source": [
"count=1"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "fe191c97",
"metadata": {},
"outputs": [],
"source": [
"# Collect a fixed set of lines (1-based line numbers) from the first input file,\n",
"# each as a list of its characters without the line terminator.\n",
"# NOTE(review): the line numbers are hard-coded and their origin is not documented here - confirm.\n",
"SELECTED_LINE_NUMBERS = {3, 70, 94, 115, 130, 139, 328, 415}\n",
"\n",
"# The list is rebuilt on every run and the numbering comes from enumerate(), so the cell\n",
"# does not depend on a counter initialised elsewhere. rstrip() also tolerates a line\n",
"# without a trailing newline, where list.remove('\\n') raises ValueError.\n",
"with open(input_file1) as file1:\n",
"    g_sequence1 = [\n",
"        list(line.rstrip('\\n'))\n",
"        for line_number, line in enumerate(file1, 1)\n",
"        if line_number in SELECTED_LINE_NUMBERS\n",
"    ]"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "375b500c",
"metadata": {},
"outputs": [],
"source": [
"count1=1"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "d3275fdb",
"metadata": {},
"outputs": [],
"source": [
"# Collect a fixed set of lines (1-based line numbers) from the second input file,\n",
"# each as a list of its characters without the line terminator.\n",
"# NOTE(review): the line numbers are hard-coded and their origin is not documented here - confirm.\n",
"SELECTED_LINE_NUMBERS = {3, 70, 94, 115, 130, 139, 328, 415}\n",
"\n",
"# The list is rebuilt on every run and the numbering comes from enumerate(), so the cell\n",
"# does not depend on a counter initialised elsewhere. rstrip() also tolerates a line\n",
"# without a trailing newline, where list.remove('\\n') raises ValueError.\n",
"with open(input_file2) as file2:\n",
"    g_sequence2 = [\n",
"        list(line.rstrip('\\n'))\n",
"        for line_number, line in enumerate(file2, 1)\n",
"        if line_number in SELECTED_LINE_NUMBERS\n",
"    ]"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "880356a8",
"metadata": {},
"outputs": [],
"source": [
"freqItemSet, rules = Fapriori(g_sequence1, minSup=.5, minConf=.5)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "4bfa662f",
"metadata": {},
"outputs": [],
"source": [
"freqItemSet1, rules1 = Fapriori(g_sequence2, minSup=.5, minConf=.5)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "beecdaae",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{1: {frozenset({'T'}), frozenset({'C'}), frozenset({'G'}), frozenset({'A'})}, 2: {frozenset({'G', 'C'}), frozenset({'G', 'A'}), frozenset({'C', 'A'}), frozenset({'T', 'C'}), frozenset({'G', 'T'}), frozenset({'T', 'A'})}, 3: {frozenset({'T', 'C', 'A'}), frozenset({'G', 'C', 'T'}), frozenset({'G', 'C', 'A'}), frozenset({'G', 'T', 'A'})}, 4: {frozenset({'G', 'C', 'A', 'T'})}} [[{'G'}, {'C'}, 1.0], [{'C'}, {'G'}, 1.0], [{'G'}, {'A'}, 1.0], [{'A'}, {'G'}, 1.0], [{'C'}, {'A'}, 1.0], [{'A'}, {'C'}, 1.0], [{'T'}, {'C'}, 1.0], [{'C'}, {'T'}, 1.0], [{'G'}, {'T'}, 1.0], [{'T'}, {'G'}, 1.0], [{'T'}, {'A'}, 1.0], [{'A'}, {'T'}, 1.0], [{'T'}, {'C', 'A'}, 1.0], [{'C'}, {'T', 'A'}, 1.0], [{'A'}, {'T', 'C'}, 1.0], [{'T', 'C'}, {'A'}, 1.0], [{'T', 'A'}, {'C'}, 1.0], [{'C', 'A'}, {'T'}, 1.0], [{'G'}, {'T', 'C'}, 1.0], [{'C'}, {'G', 'T'}, 1.0], [{'T'}, {'G', 'C'}, 1.0], [{'G', 'C'}, {'T'}, 1.0], [{'G', 'T'}, {'C'}, 1.0], [{'T', 'C'}, {'G'}, 1.0], [{'G'}, {'C', 'A'}, 1.0], [{'C'}, {'G', 'A'}, 1.0], [{'A'}, {'G', 'C'}, 1.0], [{'G', 'C'}, {'A'}, 1.0], [{'G', 'A'}, {'C'}, 1.0], [{'C', 'A'}, {'G'}, 1.0], [{'G'}, {'T', 'A'}, 1.0], [{'T'}, {'G', 'A'}, 1.0], [{'A'}, {'G', 'T'}, 1.0], [{'G', 'T'}, {'A'}, 1.0], [{'G', 'A'}, {'T'}, 1.0], [{'T', 'A'}, {'G'}, 1.0], [{'G'}, {'T', 'C', 'A'}, 1.0], [{'C'}, {'G', 'T', 'A'}, 1.0], [{'A'}, {'G', 'C', 'T'}, 1.0], [{'T'}, {'G', 'C', 'A'}, 1.0], [{'G', 'C'}, {'T', 'A'}, 1.0], [{'G', 'A'}, {'T', 'C'}, 1.0], [{'G', 'T'}, {'C', 'A'}, 1.0], [{'C', 'A'}, {'G', 'T'}, 1.0], [{'T', 'C'}, {'G', 'A'}, 1.0], [{'T', 'A'}, {'G', 'C'}, 1.0], [{'G', 'C', 'A'}, {'T'}, 1.0], [{'G', 'C', 'T'}, {'A'}, 1.0], [{'G', 'T', 'A'}, {'C'}, 1.0], [{'T', 'C', 'A'}, {'G'}, 1.0]]\n"
]
}
],
"source": [
"# Print the itemsets held in freqItemSet followed by the rules list, on one line.\n",
"print(freqItemSet, rules)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "bd4004c1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{1: {frozenset({'T'}), frozenset({'C'}), frozenset({'G'}), frozenset({'A'})}, 2: {frozenset({'G', 'C'}), frozenset({'G', 'A'}), frozenset({'C', 'A'}), frozenset({'T', 'C'}), frozenset({'G', 'T'}), frozenset({'T', 'A'})}, 3: {frozenset({'T', 'C', 'A'}), frozenset({'G', 'C', 'T'}), frozenset({'G', 'C', 'A'}), frozenset({'G', 'T', 'A'})}, 4: {frozenset({'G', 'C', 'A', 'T'})}} [[{'G'}, {'C'}, 1.0], [{'C'}, {'G'}, 1.0], [{'G'}, {'A'}, 1.0], [{'A'}, {'G'}, 1.0], [{'C'}, {'A'}, 1.0], [{'A'}, {'C'}, 1.0], [{'T'}, {'C'}, 1.0], [{'C'}, {'T'}, 1.0], [{'G'}, {'T'}, 1.0], [{'T'}, {'G'}, 1.0], [{'T'}, {'A'}, 1.0], [{'A'}, {'T'}, 1.0], [{'T'}, {'C', 'A'}, 1.0], [{'C'}, {'T', 'A'}, 1.0], [{'A'}, {'T', 'C'}, 1.0], [{'T', 'C'}, {'A'}, 1.0], [{'T', 'A'}, {'C'}, 1.0], [{'C', 'A'}, {'T'}, 1.0], [{'G'}, {'T', 'C'}, 1.0], [{'C'}, {'G', 'T'}, 1.0], [{'T'}, {'G', 'C'}, 1.0], [{'G', 'C'}, {'T'}, 1.0], [{'G', 'T'}, {'C'}, 1.0], [{'T', 'C'}, {'G'}, 1.0], [{'G'}, {'C', 'A'}, 1.0], [{'C'}, {'G', 'A'}, 1.0], [{'A'}, {'G', 'C'}, 1.0], [{'G', 'C'}, {'A'}, 1.0], [{'G', 'A'}, {'C'}, 1.0], [{'C', 'A'}, {'G'}, 1.0], [{'G'}, {'T', 'A'}, 1.0], [{'T'}, {'G', 'A'}, 1.0], [{'A'}, {'G', 'T'}, 1.0], [{'G', 'T'}, {'A'}, 1.0], [{'G', 'A'}, {'T'}, 1.0], [{'T', 'A'}, {'G'}, 1.0], [{'G'}, {'T', 'C', 'A'}, 1.0], [{'C'}, {'G', 'T', 'A'}, 1.0], [{'A'}, {'G', 'C', 'T'}, 1.0], [{'T'}, {'G', 'C', 'A'}, 1.0], [{'G', 'C'}, {'T', 'A'}, 1.0], [{'G', 'A'}, {'T', 'C'}, 1.0], [{'G', 'T'}, {'C', 'A'}, 1.0], [{'C', 'A'}, {'G', 'T'}, 1.0], [{'T', 'C'}, {'G', 'A'}, 1.0], [{'T', 'A'}, {'G', 'C'}, 1.0], [{'G', 'C', 'A'}, {'T'}, 1.0], [{'G', 'C', 'T'}, {'A'}, 1.0], [{'G', 'T', 'A'}, {'C'}, 1.0], [{'T', 'C', 'A'}, {'G'}, 1.0]]\n"
]
}
],
"source": [
"# Print the g_sequence2 results: the freqItemSet1 itemsets followed by rules1.\n",
"print(freqItemSet1, rules1)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "5d6d2717",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(freqItemSet1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "016bf481",
"metadata": {},
"outputs": [],
"source": [
"# Mine frequent itemsets and association rules from g_sequence1 with FP-growth,\n",
"# using a minimum support ratio of 0.5 and a minimum confidence of 0.5.\n",
"freqItemSet, rules = fpgrowth(g_sequence1,minSupRatio=0.5,minConf=0.5)\n",
"\n",
"# Printing the complete rule list exceeded the notebook's IOPub data rate limit\n",
"# and left no usable output, so report the rule count and a short preview instead.\n",
"print(len(rules), 'rules found; showing the first 10')\n",
"print(rules[:10])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "482b6678",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}