#!/usr/bin/env python """ Script to obtain non-redundant chains """ import os import re import collections import sys import argparse import re import logging sys.path.append('/usr/local/lib/python2.7/site-packages') from Bio.PDB.MMCIFParser import MMCIFParser from Bio.PDB.PDBParser import PDBParser from Bio.PDB import PDBIO from Bio.PDB.MMCIF2Dict import MMCIF2Dict trans = {'ALA':'A','CYS':'C','ASP':'D','GLU':'E','PHE':'F','GLY':'G','HIS':'H','ILE':'I','LYS':'K','LEU':'L','MET':'M','ASN':'N','PRO':'P','GLN':'Q','ARG':'R','SER':'S','THR':'T','VAL':'V','TRP':'W','TYR':'Y','EOF':''} strand = {'A':'ALA','C':'CYS','D':'ASP','E':'GLU','F':'PHE','G':'GLY','H':'HIS','I':'ILE','K':'LYS','L':'LEU','M':'MET','N':'ASN','P':'PRO','Q':'GLN','R':'ARG','S':'SER','T':'THR','V':'VAL','W':'TRP','Y':'TYR'} ls_chains=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z','a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z','0', '1', '2', '3', '4', '5', '6', '7', '8', '9'] def get_structure_nr_mmcif(infile,nr_chains): pdbid = infile.replace(".cif","") parser = MMCIFParser(QUIET = True) #structure = parser.get_structure('4v40', '4v40.cif') structure = parser.get_structure(pdbid, infile) for chain in structure.get_chains(): if chain.get_id() in sorted(nr_chains.keys()): #print chain.get_id() #outtargetfile="{}_{}_{}".format(pdbid,chain.get_id(),nr_chains[chain.get_id()][0])+".pdb" herr="{}_{}".format(chain.get_id(),nr_chains[chain.get_id()][0]) outtargetfile=infile.replace(".cif","_"+herr+"__.pdb") io = PDBIO() chain.id = nr_chains[chain.get_id()][0] io.set_structure(chain) io.save(outtargetfile) """ def get_structure_nr_mmcif(infile,nr_chains): pdbid = infile.replace(".cif","") parser = MMCIFParser(QUIET = True) #structure = parser.get_structure('4v40', '4v40.cif') structure = parser.get_structure(pdbid, infile) outtargetfile="test/{}".format(pdbid)+".pdb" io = PDBIO() for chain in structure.get_chains(): if chain.get_id() in sorted(nr_chains.keys()): #print chain.get_id() chain.id = nr_chains[chain.get_id()][0] io.set_structure(chain) io.save(outtargetfile) """ def get_structure_nr_pdb(infile,nr_chains): pdbid = infile.replace(".pdb","") parser = PDBParser(QUIET = True) #structure = parser.get_structure('4v40', '4v40.cif') structure = parser.get_structure(pdbid, infile) for chain in structure.get_chains(): if chain.get_id() in sorted(nr_chains.keys()): #print chain.get_id() herr="{}_{}".format(chain.get_id(),nr_chains[chain.get_id()][0]) outtargetfile=infile.replace(".pdb","_"+herr+"__.pdb") io = PDBIO() chain.id = nr_chains[chain.get_id()][0] io.set_structure(chain) io.save(outtargetfile) def recreate_nr_chains_cif(infile): cif_lines=[line.replace("\n","") for line in open(infile,"r").readlines() if line[:4]=="ATOM" if line.split("?")[-1].split()[1] in trans.keys() if line.split("?")[-1].split()[3]=="CA"]#+[line.replace("\n","") for line in open(cif_file,"r").readlines() if "ATOM" in line] perchain_dict=collections.defaultdict(list) for line in cif_lines: perchain_dict[line.split("?")[-1].split()[2]].append(line.split("?")[-1]) persequence_dict=collections.defaultdict(list) for chain in sorted(perchain_dict.keys()): seqs="".join([trans[li.split()[1]] for li in perchain_dict[chain]]) persequence_dict[seqs].append(chain) if len(persequence_dict.keys())<63: nr_chains_dict=collections.defaultdict(list) allseqssumoutput = open("{}_lognr.txt".format(infile.replace(".cif","")),"a") print>>allseqssumoutput, "Representative chains:" for index,ch in enumerate(persequence_dict.keys()): if len(persequence_dict[ch][0])==1: nr_chains_dict[persequence_dict[ch][0]].append(persequence_dict[ch][0]) if len(persequence_dict[ch])==1: print>>allseqssumoutput, "Chain {}".format(persequence_dict[ch][0]) if len(persequence_dict[ch])>1: print>>allseqssumoutput, "Chain {}".format(persequence_dict[ch][0])+" ["+",".join(sorted(persequence_dict[ch]))+"]" else: nr_chains_dict[persequence_dict[ch][0]].append(ls_chains[index]) if len(persequence_dict[ch])==1: print>>allseqssumoutput, "Chain {}, rename to Chain {}".format(persequence_dict[ch][0],ls_chains[index]) if len(persequence_dict[ch])>1: print>>allseqssumoutput, "Chain {}, rename to Chain {}".format(persequence_dict[ch][0],ls_chains[index])+" ["+",".join(sorted(persequence_dict[ch]))+"]" get_structure_nr_mmcif(infile,nr_chains_dict) def recreate_nr_chains_pdb(infile): pdb_lines=[line for line in open(infile,"r").readlines() if line[:4]=="ATOM" and line[13:15]=="CA"] perchain_dict=collections.defaultdict(list) for line in pdb_lines: perchain_dict[line[21:22]].append(line) persequence_dict=collections.defaultdict(list) nr_chains_dict=collections.defaultdict(list) allseqssumoutput = open("{}_lognr.txt".format(infile.replace(".pdb","")),"a") print>>allseqssumoutput, "Representative chains:" for chain in sorted(perchain_dict.keys()): seqs="".join([trans[li[17:20]] for li in perchain_dict[chain]]) persequence_dict[seqs].append(chain) for seq in persequence_dict.keys(): if len(persequence_dict[seq])==1: print>>allseqssumoutput, "Chain {}".format(persequence_dict[seq][0]) if len(persequence_dict[seq])>1: print>>allseqssumoutput, "Chain {}".format(persequence_dict[seq][0])+" ["+",".join(sorted(persequence_dict[seq]))+"]" for ch in persequence_dict.keys(): nr_chains_dict[persequence_dict[ch][0]].append(persequence_dict[ch][0]) get_structure_nr_pdb(infile,nr_chains_dict) if __name__ == "__main__": parser = argparse.ArgumentParser(description='Get non-redundant chains') parser.add_argument("inputfile",help="pdb or mmCIF input file") parser.add_argument("-v","--verbose", help="Long messages", dest="verbose",default=False, action="store_true") args = parser.parse_args() logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.DEBUG if args.verbose else logging.WARN) infile = args.inputfile if ".pdb" in infile: recreate_nr_chains_pdb(infile) if ".cif" in infile: recreate_nr_chains_cif(infile)