# Source code for treebuild.tree_build

#! /usr/bin/env python
#
# Copyright (C) 2016 Jing Lu <ajingnk@gmail.com>
# License: Apache

# -*- coding: utf-8 -*-

# pylint: disable=too-few-public-methods

import datetime
import os
import subprocess
import shutil
from rdkit import Chem
from rdkit.Chem.Draw import MolToFile

from .types import FingerPrintType
from .util import ParseLigandFile, WriteJSON, WriteAsPHYLIPFormat, Dot2Dict, \
    WriteDotFile, RemoveBackSlash
from .model import IMG_DIR, SMILE_COLUMNNAME, RAPIDNJ_COMMAND, FILE_FORMAT, TMP_FOLDER

class TreeBuild(object):
    """Build the tree data (JSON file plus structure images) for the
    visualization.

    There are assumptions for the data format of the input file. It is very
    important to understand these assumptions:

    1. potency (e.g. IC50/Ka/Ki) unit is nM
    2. the file must have an id column, you can set the column name with
       id_column
    3. the file must have a SMILES column, with 'Canonical_Smiles' as column
       name
    4. the file must have at least one potency column (IC50/Ka/Ki).

    To build the tree

    1. the identity column needs to be specified with id_column
    2. a list of fingerprints and a list of properties need to be specified
       with rdkit
    3. the directories for input and output file need to be specified
    """

    def __init__(self, input_file, output_file, id_column, fps, properties):
        """Run the whole pipeline: parse ligands, build one tree per
        fingerprint, write the JSON output and the structure images.

        :param input_file: input file is a tab delimited text file.
        :param output_file: output file is a json file
        :param id_column: the id for each column, which will be shown as the
            identifier in the visualization.
        :param fps: a list of FingerPrintType
        :param properties: a mapping with the keys "activities" and
            "properties" (lists of objects providing ``to_dict()``, i.e.
            PropertyType) and "ext_links" (a list of dicts that each have a
            "name" key)
        :return: void, the program will generate input file for the
            visualization.
        """
        # initial setting
        self._RAPIDNJ_COMMAND = RAPIDNJ_COMMAND
        self._FILE_FORMAT = FILE_FORMAT

        # creating folders
        if not os.path.exists(TMP_FOLDER):
            os.makedirs(TMP_FOLDER)
        if not os.path.exists(IMG_DIR):
            os.makedirs(IMG_DIR)

        activities = properties["activities"]
        other_properties = properties["properties"]
        ext_links = properties["ext_links"]

        lig_dict = self.parse_lig_file(input_file, id_column)

        # one tree per fingerprint type, keyed by the fingerprint name
        trees = dict()
        for fp in fps:
            assert isinstance(fp, FingerPrintType)
            trees[fp.name] = self._build_single_tree(lig_dict, fp)

        # circle size and circle border are both driven by the same
        # (non-activity) property list
        metadata = {
            "activityTypes": [act.to_dict() for act in activities],
            "treeTypes": [fp.to_dict() for fp in fps],
            "circleSizeTypes": [prop.to_dict() for prop in other_properties],
            "circleBorderTypes": [prop.to_dict() for prop in other_properties],
            "external": ext_links,
        }

        ext_names = [ext["name"] for ext in ext_links]
        comp_info = self.gen_properties(lig_dict, activities,
                                        other_properties, ext_names)

        final_dict = {"metadata": metadata, "trees": trees,
                      "compounds": comp_info}
        WriteJSON(final_dict, outfile=output_file, write_type="w")

        # make image file
        self.make_structures_for_smiles(lig_dict)

        # delete tmp folder
        shutil.rmtree(TMP_FOLDER)

    def _build_single_tree(self, lig_dict, fp):
        """Build a single tree with fingerprint function.

        Pipeline: distance file -> rapidnj (newick) -> dot file -> sfdp
        layout -> dictionary.

        :param lig_dict: all ligand information
        :param fp: fingerprint object (its ``fp_func`` attribute is used)
        :return: the value produced by :meth:`dot2dict` for the laid-out
            dot file
        """
        distfile = self.gen_dist_file(lig_dict, fp.fp_func)
        newick_o = self.run_rapidnj(distfile)
        dot_inf = self.write_dotfile(newick_o)
        dot_out = self.sfdp_dot(dot_inf, 10)
        return self.dot2dict(dot_out)

    @staticmethod
    def parse_lig_file(in_file, identifier):
        """Parse ligand file and return a dictionary with identifier as IDs.

        Thin wrapper around ``ParseLigandFile``.

        :param in_file: input file directory
        :param identifier: name for the identifier
        :return: a dictionary with ligand information
        """
        return ParseLigandFile(in_file, identifier)

    @staticmethod
    def gen_dist_file(liganddict, fp_func):
        """Generate distance file which is the input of rapidnj program.

        :param liganddict: ligand information
        :param fp_func: fingerprint function
        :return: filename for distance file
        """
        smile_list = [[lig_name, liganddict[lig_name][SMILE_COLUMNNAME]]
                      for lig_name in liganddict]
        # Parenthesised single-argument print emits the same text on
        # Python 2 and Python 3.
        print("finish smile list")
        filename = WriteAsPHYLIPFormat(smile_list, fp_func)
        print("finish writing phyli file")
        return filename

    def run_rapidnj(self, distance_file):
        """Run rapidnj program on distance_file.

        :param distance_file: directory of distance file
        :return: newick string (the raw standard output of the command)
        """
        proc = subprocess.Popen(
            [self._RAPIDNJ_COMMAND, distance_file, "-i", "pd"],
            stdout=subprocess.PIPE)
        # communicate() drains stdout, closes the pipe and waits for the
        # child, so no zombie process or open file descriptor is left behind.
        newick, _ = proc.communicate()
        return newick

    @staticmethod
    def write_dotfile(newick):
        """Write newick string as dot file.

        Thin wrapper around ``WriteDotFile``.

        :param newick: newick string
        :return: dot file
        """
        return WriteDotFile(newick)

    def sfdp_dot(self, dot_infile, size):
        """Run sfdp on dot file.

        :param dot_infile: directory for dot file
        :param size: parameter for the sfdp (passed as ``-Gsize``)
        :return: new filename
        :raises OSError: if the ``sfdp`` executable cannot be started
        """
        fmt = self._FILE_FORMAT + '_sfdp.gv'
        newfilename = datetime.datetime.now().strftime(fmt)
        if os.path.isfile(newfilename):
            os.remove(newfilename)
        # Argument list instead of a shell string: file names containing
        # spaces or shell metacharacters are passed through untouched.
        command = [
            "sfdp",
            "-Gsmoothing=triangle",
            "-Gsize={size}".format(size=size),
            dot_infile,
        ]
        # sfdp writes the laid-out graph to stdout; send it to the new file.
        with open(newfilename, "wb") as outfile:
            subprocess.call(command, stdout=outfile)
        RemoveBackSlash(newfilename)
        return newfilename

    @staticmethod
    def dot2dict(dot_outfile):
        """Convert a dot file into a dictionary.

        Thin wrapper around ``Dot2Dict``, called with ``None`` as second
        argument.

        :param dot_outfile: directory of the dot file
        :return: whatever ``Dot2Dict`` returns for that file
        """
        return Dot2Dict(dot_outfile, None)

    @staticmethod
    def gen_properties(ligand_dict, activities, properties, ext_cols):
        """Generate properties for each molecule.

        :param ligand_dict: ligand dictionary which keep all ligand
            information
        :param activities: a list of PropertyType objects
        :param properties: a list of PropertyType objects
        :param ext_cols: the column name for external links
        :return: a list with one dictionary per compound, ordered by the
            numeric part of the compound id
        :raises KeyError: if an expected "B<index>" id or a column is missing
        """
        compounds = []
        # Ligands are looked up as "B0", "B1", ... so the output order is
        # deterministic. Presumably these ids are assigned by the ligand
        # file parser -- verify against ParseLigandFile.
        for idx in range(len(ligand_dict)):
            lid = "B" + str(idx)
            ligand = ligand_dict[lid]
            comp = {
                "id": lid,
                "orig_id": ligand["orig_id"],
                "activities": {},
                "properties": {},
                "external": {},
            }
            for act in activities:
                comp["activities"][act.name] = act.gen_property(ligand)
            for prop in properties:
                comp["properties"][prop.name] = prop.gen_property(ligand)
            # NOTE(review): external values are stored at the top level of
            # the compound and comp["external"] stays empty -- confirm this
            # is what the consumer of the JSON expects.
            for col in ext_cols:
                ext_val = ligand[col]
                if isinstance(ext_val, float):
                    # drop the fractional part so 12.0 is written as "12"
                    comp[col] = str(int(ext_val))
                else:
                    comp[col] = str(ext_val)
            compounds.append(comp)
        return compounds

    @staticmethod
    def make_structures_for_smiles(ligand_dict):
        """Make structure figures from smile strings.

        All image files will be in the IMG_DIR, named ``<orig_id>.svg``.

        :param ligand_dict: ligand dictionary which keep all ligand
            information
        :return: None
        :raises Exception: if a SMILES string cannot be parsed or the image
            cannot be written
        """
        relative_dir = IMG_DIR
        for ligand in ligand_dict.values():
            smile = ligand[SMILE_COLUMNNAME]
            img_path = os.path.join(relative_dir,
                                    '{}.svg'.format(ligand["orig_id"]))
            mol = Chem.MolFromSmiles(smile)
            if mol is None:
                # RDKit signals an unparsable SMILES by returning None;
                # report that instead of a misleading write failure.
                raise Exception("cannot parse SMILES {!r} for file: {}".format(
                    smile, img_path))
            try:
                MolToFile(mol, img_path)
            except Exception as err:
                # Keep the original message prefix, but do not hide the
                # cause and do not trap KeyboardInterrupt/SystemExit.
                raise Exception("cannot write to file: " + img_path +
                                " ({!r})".format(err))