Source code for rnaglib.utils.wrappers

""" Various wrappers for external tools """

import os
import re
import shutil
from pathlib import Path
from collections import defaultdict
from typing import Union, Optional
import tempfile
import subprocess

from rnaglib.utils import cif_remove_residues


def US_align_wrapper(
    cif_path_1: Union[str, os.PathLike],
    cif_path_2: Union[str, os.PathLike],
    flags: tuple = ("-mm", "1", "-ter", "1"),
    reslist_1: Optional[list] = None,
    reslist_2: Optional[list] = None,
):
    """Calls USalign on two mmCIF files and returns the output.
    Must have USalign (https://zhanggroup.org/US-align/bin/module/USalign.cpp) in your executable path.
    """

    assert Path(cif_path_1).exists(), f"{cif_path_1} missing"
    assert Path(cif_path_2).exists(), f"{cif_path_2} missing"

    with tempfile.TemporaryDirectory() as tmpdir:
        if not reslist_1 is None:
            new_cif_1 = Path(tmpdir) / "rna_1.cif"
            cif_remove_residues(cif_path_1, reslist_1, new_cif_1)
            cif_path_1 = new_cif_1
        if not reslist_2 is None:
            new_cif_2 = Path(tmpdir) / "rna_2.cif"
            cif_remove_residues(cif_path_2, reslist_2, new_cif_2)
            cif_path_2 = new_cif_2

        command = [
            "USalign",
            *flags,
            cif_path_1,
            cif_path_2,
        ]

    result = subprocess.run(command, capture_output=True, text=True)
    print(result.stdout)
    # Regular expression to find all TM-scores
    tm_scores = re.findall(r"TM-score=\s*([\d.]+)", result.stdout)

    # Convert to float if needed
    tm_scores = [float(score) for score in tm_scores]

    try:
        return max(tm_scores)
    except ValueError:
        return None


[docs]def rna_align_wrapper( cif_path_1: Union[str, os.PathLike], cif_path_2: Union[str, os.PathLike], flags: tuple = ("-a", "T"), reslist_1: Optional[list] = None, reslist_2: Optional[list] = None, ): """Calls RNAalign on two mmCIF files and returns the output. Must have RNAalign (https://zhanggroup.org/RNA-align/download.html) in your executable path. """ # assert shutil.which('RNAalign') is not None,\ # "RNAalign installation not found. Go here https://zhanggroup.org/RNA-align/" assert Path(cif_path_1).exists(), f"{cif_path_1} missing" assert Path(cif_path_2).exists(), f"{cif_path_2} missing" with tempfile.TemporaryDirectory() as tmpdir: if not reslist_1 is None: new_cif_1 = Path(tmpdir) / "rna_1.cif" cif_remove_residues(cif_path_1, reslist_1, new_cif_1) cif_path_1 = new_cif_1 if not reslist_2 is None: new_cif_2 = Path(tmpdir) / "rna_1.cif" cif_remove_residues(cif_path_2, reslist_2, new_cif_2) cif_path_2 = new_cif_2 command = [ "RNAalign", *flags, cif_path_1, cif_path_2, ] result = subprocess.run(command, capture_output=True, text=True) if "-a" in flags: pattern = r"TM-score=\s*([\d.]+)\s*\(if normalized by average length of two structures" match = re.search(pattern, result.stdout) if match: tm = float(match.group(1)) return tm else: print(result.stderr) print(result.stdout) return None pass
[docs]def locarna_wrapper(seq_1: str, seq_2: str): """Calls LocaRNA on two RNA sequences to perform sequence-2d-structure aligment""" with tempfile.TemporaryDirectory as tdir: seq_1_file = Path(tdir) / "seq_1.fa" seq_2_file = Path(tdir) / "seq_2.fa" with open(seq_1_file, "w") as s1: seq_1_file.write(f"> s1\n {seq_1}") with open(seq_2_file, "w") as s2: seq_2_file.write(f"> s2\n {seq_2}") command = ["locarna", seq_1_file, seq_2_file] result = subprocess.run(command, capture_output=True, text=True) pass
[docs]def cdhit_wrapper(ids, sequences, sim_thresh=0.6, n_jobs=1): """Cluster sequences using CD-hit. Adapted from ProteinShake. Choose of word size: -n 5 for thresholds 0.7 ~ 1.0 -n 4 for thresholds 0.6 ~ 0.7 -n 3 for thresholds 0.5 ~ 0.6 -n 2 for thresholds 0.4 ~ 0.5 Parameters ----------- sequences: list List of protein sequences to cluster. Returns -------- representatives: list List of sequence indices to preserve as representatives. """ assert sim_thresh >= 0.4 and sim_thresh <= 1, "Similarity threshold not in [0.4, 1]" if sim_thresh >= 0.4 and sim_thresh < 0.5: word_size = 2 elif sim_thresh >= 0.5 and sim_thresh < 0.6: word_size = 3 elif sim_thresh >= 0.6 and sim_thresh < 0.7: word_size = 4 else: word_size = 5 """ assert shutil.which('cd-hit') is not None,\ "CD-HIT installation not found. Go here https://github.com/weizhongli/cdhit to install" """ n_jobs = 0 if n_jobs < 0 else n_jobs with tempfile.TemporaryDirectory() as tmpdir: in_file = Path(tmpdir) / "in.fasta" out_file = Path(tmpdir) / "out.fasta" with open(in_file, "w") as inp: for id, s in zip(ids, sequences): inp.write(f">{id}\n") inp.write(s + "\n") cmd = [ "cd-hit", "-c", str(sim_thresh), "-i", in_file, "-n", str(word_size), "-o", out_file, "-T", str(n_jobs), "-M", "0", # unlimited memory ] subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # parse cluster assignments pdb2cluster = {} cluster2pdb = defaultdict(list) with open(str(out_file) + ".clstr", "r") as out: for line in out: if line.startswith(">"): clust_id = int(line.split()[1]) continue pdb_id = line.split(">")[1].split(".")[0] pdb2cluster[pdb_id] = clust_id cluster2pdb[clust_id].append(pdb_id) return pdb2cluster, cluster2pdb