Two programs given below, residue_filter.pl and create_residue_potential.pl. The latter program calls the former one as well as the Windows version of Qhull (www.qhull.org). The former program requires access to every PDB file listed in file 1417proteinlist.txt, which is read in by the latter program. Place the programs in separate files with their respective names, and download Qhull for Windows, as well as the 1417 PDB files. PROGRAM residue_filter.pl #!/usr/bin/perl #Majid Masso #School of Systems Biology, George Mason University #April 2013 #Filter in CA ATOM lines of protein chain #Model 1 selected in NMR files, and one atomic alternate location letter used #Filter out chains with one or more non-standard amino acids #Can run as stand-alone with or without creation of an output file #Used by all other programs to create a filtered PDB file prior to their implementation #Output is directly piped into the other programs without creating an output file $arg=0; for($i=0;$i<@ARGV;$i++){ if($ARGV[$i] eq "-e"){ $pdbfile=$ARGV[$i+1]; $arg++; } elsif($ARGV[$i] eq "-o"){ $outfile=$ARGV[$i+1]; } } if($arg!=1){die "Usage: perl residue_filter.pl -e -o \n";} $pdb=lc(substr($pdbfile,0,4)); $mid=substr($pdb,1,2); $chain=uc(substr($pdbfile,4,1)); open(PDB,"pdb$pdb.ent") or die "Cannot locate PDB file $pdb\n"; if($chain eq ''){die "Missing a protein chain letter at the end of the PDB code\n";} @allres=(ALA,CYS,ASP,GLU,PHE,GLY,HIS,ILE,LYS,LEU,MET,ASN,PRO,GLN,ARG,SER,THR,VAL,TRP,TYR);$c=0;$d=0; if($outfile eq ''){ while(){ chomp($_); $at[$c]=substr($_,13,2);$altloc[$c]=substr($_,16,1);$aa[$c]=substr($_,17,3);$chn[$c]=substr($_,21,1);$resnum[$c]=substr($_,22,4);$ctr=0; if(($_=~/^ATOM/ || $_=~/^HETATM/) && ($chain eq $chn[$c] || ($chain eq '@' || $chn[$c] eq ' ')) && (($altloc[$c] eq ' ' || $altloc[$c] eq '1') || ($altloc[$c] eq 'A' || $altloc[$c] eq 'D')) && $at[$c] eq 'CA'){ unless($c==0){ $value = $resnum[$c]-$resnum[$c-1]; if($value > 1){die "Cannot generate predictions due to gap in the protein structure after sequence position number $resnum[$c-1]\n";} } for($i=0;$i<20;$i++){ if($aa[$c] eq $allres[$i]){ $ctr++; last; } } if($ctr==0){die "Cannot generate predictions due to non-standard amino acid residue $aa[$c] at position number $resnum[$c]\n";} $line[$c]=$_; $c++; } if($_=~/^ENDMDL/ || ($_=~/^TER/ && $aa[$c] eq $aa[$c-1] && ($chain eq $chn[$c] || ($chain eq '@' || $chn[$c] eq ' ')) && $resnum[$c]==$resnum[$c-1])){last;} if($_=~/^HELIX/ || $_=~/^SHEET/){$ssline[$d]=$_;$d++;} } close(PDB); for($k=0;$k<$d;$k++){print "$ssline[$k]\n";} for($j=0;$j<$c;$j++){print "$line[$j]\n";} } else{ open(OUT,">$outfile"); while(){ chomp($_); $at[$c]=substr($_,13,2);$altloc[$c]=substr($_,16,1);$aa[$c]=substr($_,17,3);$chn[$c]=substr($_,21,1);$resnum[$c]=substr($_,22,4);$ctr=0; if(($_=~/^ATOM/ || $_=~/^HETATM/) && ($chain eq $chn[$c] || ($chain eq '@' || $chn[$c] eq ' ')) && (($altloc[$c] eq ' ' || $altloc[$c] eq '1') || ($altloc[$c] eq 'A' || $altloc[$c] eq 'D')) && $at[$c] eq 'CA'){ unless($c==0){ $value = $resnum[$c]-$resnum[$c-1]; if($value > 1){die "Cannot generate predictions due to gap in the protein structure after sequence position number $resnum[$c-1]\n";} } for($i=0;$i<20;$i++){ if($aa[$c] eq $allres[$i]){ $ctr++; last; } } if($ctr==0){die "Cannot generate predictions due to non-standard amino acid residue $aa[$c] at position number $resnum[$c]\n";} $line[$c]=$_; $c++; } if($_=~/^ENDMDL/ || ($_=~/^TER/ && $aa[$c] eq $aa[$c-1] && ($chain eq $chn[$c] || ($chain eq '@' || $chn[$c] eq ' ')) && $resnum[$c]==$resnum[$c-1])){last;} if($_=~/^HELIX/ || $_=~/^SHEET/){$ssline[$d]=$_;$d++;} } close(PDB); for($k=0;$k<$d;$k++){print OUT "$ssline[$k]\n";} for($j=0;$j<$c;$j++){print OUT "$line[$j]\n";} } close(OUT); ------------------------------------------------------------------------ PROGRAM create_residue_potential.pl #!/usr/bin/perl #Majid Masso #School of Systems Biology, George Mason University #April 2013 #4-body potential using a 20-letter residue alphabet, nocutoff or with cutoff (edit code) $cutoff=12.0; %reslist=('A'=>'ALA','C'=>'CYS','D'=>'ASP','E'=>'GLU','F'=>'PHE','G'=>'GLY','H'=>'HIS','I'=>'ILE','K'=>'LYS','L'=>'LEU','M'=>'MET','N'=>'ASN','P'=>'PRO','Q'=>'GLN','R'=>'ARG','S'=>'SER','T'=>'THR','V'=>'VAL','W'=>'TRP','Y'=>'TYR'); @ressums=(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0); @at=sort(keys(%reslist));$a=0;$tabs=1; for($i=0;$i<20;$i++){ for($j=$i;$j<20;$j++){ for($k=$j;$k<20;$k++){ for($l=$k;$l<20;$l++){ $quad[$a]=$at[$i].$at[$j].$at[$k].$at[$l]; if($j>$i && $k>$j && $l>$k){$perm[$a]=24;} elsif(($j==$i && $k>$j && $l>$k) || ($j>$i && $k==$j && $l>$k) || ($j>$i && $k>$j && $l==$k)){$perm[$a]=12;} elsif($j==$i && $k>$j && $l==$k){$perm[$a]=6;} elsif(($j==$i && $k==$j && $l>$k) || ($j>$i && $k==$j && $l==$k)){$perm[$a]=4;} elsif($j==$i && $k==$j && $l==$k){$perm[$a]=1;} $a++; } } } } for($i=0;$i<$a;$i++){ $freqs[$i]=0;} open(LIST, "1417proteinlist.txt"); while(){ chomp($_); $len=length($_); if($len==5){ $pdb=substr($_,0,4); $chain=substr($_,4,1); $subdir=substr($pdb,1,2);} else{ die "Chain not provided for $_\n";} @atoms=(); @coords=(); $ct=0; open(FILEA, "perl residue_filter.pl -e $_ |"); while(){ unless($_=~/^HELIX/ || $_=~/^SHEET/){ $chn=substr($_,21,1);$aa=substr($_,17,3);$resnum=substr($_,22,4);$x=substr($_,30,8);$y=substr($_,38,8);$z=substr($_,46,8); $ct++; @point=($x,$y,$z); push(@coords,[ @point ]); if($aa eq $reslist{"A"}){$ressums[0]++; push(@atoms,('A'));} elsif($aa eq $reslist{"C"}){$ressums[1]++; push(@atoms,('C'));} elsif($aa eq $reslist{"D"}){$ressums[2]++; push(@atoms,('D'));} elsif($aa eq $reslist{"E"}){$ressums[3]++; push(@atoms,('E'));} elsif($aa eq $reslist{"F"}){$ressums[4]++; push(@atoms,('F'));} elsif($aa eq $reslist{"G"}){$ressums[5]++; push(@atoms,('G'));} elsif($aa eq $reslist{"H"}){$ressums[6]++; push(@atoms,('H'));} elsif($aa eq $reslist{"I"}){$ressums[7]++; push(@atoms,('I'));} elsif($aa eq $reslist{"K"}){$ressums[8]++; push(@atoms,('K'));} elsif($aa eq $reslist{"L"}){$ressums[9]++; push(@atoms,('L'));} elsif($aa eq $reslist{"M"}){$ressums[10]++; push(@atoms,('M'));} elsif($aa eq $reslist{"N"}){$ressums[11]++; push(@atoms,('N'));} elsif($aa eq $reslist{"P"}){$ressums[12]++; push(@atoms,('P'));} elsif($aa eq $reslist{"Q"}){$ressums[13]++; push(@atoms,('Q'));} elsif($aa eq $reslist{"R"}){$ressums[14]++; push(@atoms,('R'));} elsif($aa eq $reslist{"S"}){$ressums[15]++; push(@atoms,('S'));} elsif($aa eq $reslist{"T"}){$ressums[16]++; push(@atoms,('T'));} elsif($aa eq $reslist{"V"}){$ressums[17]++; push(@atoms,('V'));} elsif($aa eq $reslist{"W"}){$ressums[18]++; push(@atoms,('W'));} elsif($aa eq $reslist{"Y"}){$ressums[19]++; push(@atoms,('Y'));} } } close(FILEA); open(INPUT, ">dinputs.txt"); print INPUT "3\n$ct\n"; for($i=0;$i<$ct;$i++){ print INPUT "@{@coords[$i]}\n"; } system("qhull d Qbb < dinputs.txt i > doutputs.txt"); close(INPUT); open(FILEB, "doutputs.txt"); while(){ chomp($_); @inds=split(/ /,$_); $indsize=@inds; if($indsize==4 && sqrt(($coords[$inds[0]][0]-$coords[$inds[1]][0])**2+($coords[$inds[0]][1]-$coords[$inds[1]][1])**2+($coords[$inds[0]][2]-$coords[$inds[1]][2])**2) <=$cutoff && sqrt(($coords[$inds[0]][0]-$coords[$inds[2]][0])**2+($coords[$inds[0]][1]-$coords[$inds[2]][1])**2+($coords[$inds[0]][2]-$coords[$inds[2]][2])**2)<=$cutoff && sqrt(($coords[$inds[0]][0]-$coords[$inds[3]][0])**2+($coords[$inds[0]][1]-$coords[$inds[3]][1])**2+($coords[$inds[0]][2]-$coords[$inds[3]][2])**2)<=$cutoff && sqrt(($coords[$inds[1]][0]-$coords[$inds[2]][0])**2+($coords[$inds[1]][1]-$coords[$inds[2]][1])**2+($coords[$inds[1]][2]-$coords[$inds[2]][2])**2)<=$cutoff && sqrt(($coords[$inds[1]][0]-$coords[$inds[3]][0])**2+($coords[$inds[1]][1]-$coords[$inds[3]][1])**2+($coords[$inds[1]][2]-$coords[$inds[3]][2])**2)<=$cutoff && sqrt(($coords[$inds[2]][0]-$coords[$inds[3]][0])**2+($coords[$inds[2]][1]-$coords[$inds[3]][1])**2+($coords[$inds[2]][2]-$coords[$inds[3]][2])**2)<=$cutoff){ @letters=($atoms[$inds[0]],$atoms[$inds[1]],$atoms[$inds[2]],$atoms[$inds[3]]); @sorted=sort(@letters); $ngram=$sorted[0].$sorted[1].$sorted[2].$sorted[3]; for($i=0;$i<$a;$i++){ if($ngram eq $quad[$i]){ $freqs[$i]+=1; last; } } } } close(FILEB); print "$tabs\n";$tabs++; if($ct==0){die "PROBLEM HERE\n";} } close(LIST); $quadsum=0; for($i=0;$i<$a;$i++){ $quadsum+=$freqs[$i]; } for($i=0;$i<$a;$i++){ $relfreqs[$i]=$freqs[$i]/$quadsum; } $totalaas=0; for($i=0;$i<20;$i++){ $totalaas+=$ressums[$i]; } for($i=0;$i<20;$i++){ $aarelfreqs[$i]=$ressums[$i]/$totalaas; } $b=0; for($i=0;$i<20;$i++){ for($j=$i;$j<20;$j++){ for($k=$j;$k<20;$k++){ for($l=$k;$l<20;$l++){ $refdist[$b]=$perm[$b]*$aarelfreqs[$i]*$aarelfreqs[$j]*$aarelfreqs[$k]*$aarelfreqs[$l]; if($relfreqs[$b]!=0){ $potential[$b]=-log($relfreqs[$b]/$refdist[$b])/log(10);} else{$potential[$b]='--';} $b++; } } } } open(POT,">potential1417cut12data.txt"); print POT "Quad Quad_Absolute_Frequency Quad_Relative_Frequency Quad_Ref_Dist Potential\n"; for($i=0;$i<$b;$i++){ print POT "$quad[$i] $freqs[$i] $relfreqs[$i] $refdist[$i] $potential[$i]\n"; } close(POT);