Source code for rnaglib.utils.wrappers

"""Various wrappers for external tools"""

import os
import re
import subprocess
import tempfile
from collections import defaultdict
from pathlib import Path

from rnaglib.utils import cif_remove_residues


def US_align_wrapper(
    cif_path_1: str | os.PathLike,
    cif_path_2: str | os.PathLike,
    flags: tuple = ("-mm", "1", "-ter", "1"),
    reslist_1: list | None = None,
    reslist_2: list | None = None,
    file_type: str = "cif",
):
    """Calls USalign on two mmCIF files and returns the output.

    Must have USalign (https://zhanggroup.org/US-align/bin/module/USalign.cpp) in your executable path.
    """
    assert Path(cif_path_1).exists(), f"{cif_path_1} missing"
    assert Path(cif_path_2).exists(), f"{cif_path_2} missing"

    with tempfile.TemporaryDirectory() as tmpdir:
        if reslist_1 is not None:
            new_cif_1 = Path(tmpdir) / "rna_1." + str(file_type)
            cif_remove_residues(cif_path_1, reslist_1, new_cif_1, file_type)
            cif_path_1 = new_cif_1
        if reslist_2 is not None:
            new_cif_2 = Path(tmpdir) / "rna_2." + str(file_type)
            cif_remove_residues(cif_path_2, reslist_2, new_cif_2, file_type)
            cif_path_2 = new_cif_2

        command = [
            "USalign",
            *flags,
            cif_path_1,
            cif_path_2,
        ]

    result = subprocess.run(command, capture_output=True, text=True, check=False)
    # print(result.stdout)
    # uncomment above for debugging
    # Regular expression to find all TM-scores
    tm_scores = re.findall(r"TM-score=\s*([\d.]+)", result.stdout)

    # Convert to float if needed
    tm_scores = [float(score) for score in tm_scores]

    try:
        return max(tm_scores)
    except ValueError:
        return None


[docs] def rna_align_wrapper( cif_path_1: str | os.PathLike, cif_path_2: str | os.PathLike, flags: tuple = ("-a", "T"), reslist_1: list | None = None, reslist_2: list | None = None, ): """Calls RNAalign on two mmCIF files and returns the output. Must have RNAalign (https://zhanggroup.org/RNA-align/download.html) in your executable path. """ # assert shutil.which('RNAalign') is not None,\ # "RNAalign installation not found. Go here https://zhanggroup.org/RNA-align/" assert Path(cif_path_1).exists(), f"{cif_path_1} missing" assert Path(cif_path_2).exists(), f"{cif_path_2} missing" with tempfile.TemporaryDirectory() as tmpdir: if reslist_1 is not None: new_cif_1 = Path(tmpdir) / "rna_1.cif" cif_remove_residues(cif_path_1, reslist_1, new_cif_1) cif_path_1 = new_cif_1 if reslist_2 is not None: new_cif_2 = Path(tmpdir) / "rna_1.cif" cif_remove_residues(cif_path_2, reslist_2, new_cif_2) cif_path_2 = new_cif_2 command = [ "RNAalign", *flags, cif_path_1, cif_path_2, ] result = subprocess.run(command, capture_output=True, text=True, check=False) if "-a" in flags: pattern = r"TM-score=\s*([\d.]+)\s*\(if normalized by average length of two structures" match = re.search(pattern, result.stdout) if match: tm = float(match.group(1)) return tm print(result.stderr) print(result.stdout) return None
[docs] def locarna_wrapper(seq_1: str, seq_2: str): """Calls LocaRNA on two RNA sequences to perform sequence-2d-structure aligment""" with tempfile.TemporaryDirectory as tdir: seq_1_file = Path(tdir) / "seq_1.fa" seq_2_file = Path(tdir) / "seq_2.fa" with open(seq_1_file, "w") as s1: seq_1_file.write(f"> s1\n {seq_1}") with open(seq_2_file, "w") as s2: seq_2_file.write(f"> s2\n {seq_2}") command = ["locarna", seq_1_file, seq_2_file] result = subprocess.run(command, capture_output=True, text=True, check=False)
[docs] def cdhit_wrapper(ids, sequences, sim_thresh=0.6, n_jobs=1): """Cluster sequences using CD-hit. Adapted from ProteinShake. Choose of word size: -n 5 for thresholds 0.7 ~ 1.0 -n 4 for thresholds 0.6 ~ 0.7 -n 3 for thresholds 0.5 ~ 0.6 -n 2 for thresholds 0.4 ~ 0.5 Parameters ----------- sequences: list List of protein sequences to cluster. Returns: -------- representatives: list List of sequence indices to preserve as representatives. """ assert sim_thresh >= 0.4 and sim_thresh <= 1, "Similarity threshold not in [0.4, 1]" if sim_thresh >= 0.4 and sim_thresh < 0.5: word_size = 2 elif sim_thresh >= 0.5 and sim_thresh < 0.6: word_size = 3 elif sim_thresh >= 0.6 and sim_thresh < 0.7: word_size = 4 else: word_size = 5 """ assert shutil.which('cd-hit') is not None,\ "CD-HIT installation not found. Go here https://github.com/weizhongli/cdhit to install" """ n_jobs = max(n_jobs, 0) with tempfile.TemporaryDirectory() as tmpdir: in_file = Path(tmpdir) / "in.fasta" out_file = Path(tmpdir) / "out.fasta" with open(in_file, "w") as inp: for id, s in zip(ids, sequences, strict=False): inp.write(f">{id}\n") inp.write(s + "\n") cmd = [ "cd-hit", "-c", str(sim_thresh), "-i", in_file, "-n", str(word_size), "-l", "5", "-o", out_file, "-T", str(n_jobs), "-M", "0", # unlimited memory ] subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, check=False) # parse cluster assignments pdb2cluster = {} cluster2pdb = defaultdict(list) with open(str(out_file) + ".clstr") as out: for line in out: if line.startswith(">"): clust_id = int(line.split()[1]) continue pdb_id = line.split(">")[1].split(".")[0] pdb2cluster[pdb_id] = clust_id cluster2pdb[clust_id].append(pdb_id) return pdb2cluster, cluster2pdb