{ "cells": [ { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [], "source": [ "from Bio import SeqIO\n", "from Bio import AlignIO\n", "from Bio.Align import AlignInfo\n", "from Bio.Seq import Seq\n", "from collections import defaultdict\n", "import os\n", "import gzip\n", "import io\n", "import sys\n", "import numpy as np\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Alignment length 1824\n" ] } ], "source": [ "## Align sequences\n", "#from Bio.Align.Applications import MuscleCommandline\n", "#muscle_cline = MuscleCommandline(input=\"/Users/gsalle/Documents/CYATHOMIX/WP3/Nemabiome/Strongylidae97_mt/Strongylidae97_mt_all.fasta\")\n", "#stdout, stderr = muscle_cline()\n", "#from io import StringIO\n", "#from Bio import AlignIO\n", "#align = AlignIO.read(StringIO(stdout), \"fasta\")\n", "#print(align)\n", "\n", "## Read alignment\n", "alignment = AlignIO.read(open(\"/Users/gsalle/Documents/CYATHOMIX/WP3/Nemabiome/Strongylidae97_mt/Strongylidae97_mt_all_muscle.fa\"), \"fasta\")\n", "print(\"Alignment length %i\" % alignment.get_alignment_length())\n", "#for record in alignment:\n", "# print(record.seq + \" \" + record.id)\n", "\n", "## Compute similarity across positions\n", "#align_array = np.array([list(rec) for rec in alignment], np.character)\n", "#for i in range(alignment.get_alignment_length()):\n", "# print(alignment[:, i])\n", " \n", "summary_align = AlignInfo.SummaryInfo(alignment)\n", "## Generate consensus sequence\n", "consensus = summary_align.dumb_consensus()\n", "\n", "#Now, we want to make the PSSM, but ignore any N ambiguity residues when calculating this:\n", "my_pssm = summary_align.pos_specific_score_matrix(consensus, chars_to_ignore=[\"N\"])\n", "#print(my_pssm)\n", "#print(my_pssm[alignment.get_alignment_length()-2:])\n", "#print(my_pssm[0:])\n", "#print(my_pssm[1:])\n", "#print(my_pssm[2:])\n", "\n", "#df = pd.DataFrame()\n", "#If you will be working heavily with the columns, \n", "#you can tell NumPy to store the array by column (as in Fortran) rather than its default of by row (as in C):\n", "#align_array = np.array([list(rec) for rec in alignment], np.character, order=\"F\")\n", "#print(\"Array shape %i by %i\" % align_array.shape)" ] }, { "cell_type": "code", "execution_count": 106, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1823\n" ] } ], "source": [ "myvec = []\n", "print(alignment.get_alignment_length()-1)\n", "for i in range(alignment.get_alignment_length()-1):\n", " #print(my_pssm[i:][1]['-'])\n", " idc = my_pssm[i:][1]['-']\n", " #print(i,idc)\n", " if idc < 5:\n", " tempdic = my_pssm[i:][1]\n", " tempdic['Consensus'] = my_pssm[i:][0]\n", " tempdic['Position'] = i\n", " myvec.append(tempdic)\n", " del tempdic\n", "df = pd.DataFrame.from_dict(myvec)\n", "df.head()\n", "\n", "df.to_csv('/Users/gsalle/Documents/CYATHOMIX/WP3/Nemabiome/cox1_base_count.tsv', sep='\\t', index=False) " ] }, { "cell_type": "code", "execution_count": 107, "metadata": {}, "outputs": [], "source": [ "## Compute similarity per position\n", "#import matplotlib.pyplot as plt\n", "#df2 = df.groupby(['Position'])['Position'].count().unstack('A')\n", "#df2.head()\n", "#df2[['A','C','G','T']].plot(kind='bar', stacked=True)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 2 }