Source code for rnaglib.utils.misc

import json
import os
from pathlib import Path
from typing import Optional, Union

import gemmi
from torch.utils.data import Subset


def listdir_fullpath(d):
    return [os.path.join(d, f) for f in os.listdir(d)]


def tonumpy(torch_array):
    """Routine to get any torch tensor back to numpy without cpu or gradient problems

    :param torch_array: A torch array

    :return: A numpy array
    """
    return torch_array.detach().cpu().numpy()


def get_dataset(loader):
    """Using torch subsets have a weird property that messes up the dataset accessing.
     This is the recommended fix in https://discuss.pytorch.org/t/how-to-get-a-part-of-datasets/82161/7

    :param dataset: either a dataset or a Subset
    :param node_sim:

    :return:
    """
    if isinstance(loader, Subset):
        return loader.dataset.dataset
    return loader.dataset


def load_index(redundancy="nr", version="1.0.0", glib_path=f"{os.path.expanduser('~')}/.rnaglib"):
    index_file = os.path.join(glib_path, f"indexes/rnaglib-{redundancy}-{version}.json")

    try:
        with open(index_file) as indx:
            return json.load(indx)
    except FileNotFoundError:
        print(f"Index file not found at {index_file}. Run rnaglib_index")
        return None



[docs]
def cif_remove_residues(
    cif_path: Union[str, os.PathLike],
    keep_residues: Union[list, None],
    out_path: Union[str, os.PathLike],
    file_type: str = "cif",
):
    """Remove all residues from a cif file except for those in `keep_residues` list.

    Save the new cif to `out_path`.
    :param cif_path: path to input cif
    :param keep_residues: list of residue IDs in format (chain_id, position) to keep.
    :param out_path: path to write new cif file
    """
    # Load the mmCIF file
    cif_model = gemmi.read_structure(str(cif_path))

    # Iterate through models and chains
    for model in cif_model:
        empty_chains = []
        for chain in model:
            for i in range(len(chain) - 1, -1, -1):
                res = chain[i]
                if (chain.name, int(res.seqid.num)) not in keep_residues:
                    del chain[i]
            # Mark chain for removal if it's empty
            if len(chain) == 0:
                empty_chains.append(chain.name)

        # Remove empty chains
        for chain_name in empty_chains:
            model.remove_chain(chain_name)
    # Save the modified structure to a new mmCIF or PDB file
    if file_type == "cif":
        cif_model.make_mmcif_document().write_file(str(out_path))
    else:
        cif_model.write_pdb(str(out_path))



def filter_cif_with_res(cif_path, keep_residues, out_path, file_type="cif"):
    # Read the CIF file and keep only the first model.
    structure = gemmi.read_structure(str(cif_path))
    if len(structure) > 1:
        del structure[1:]
    try:
        model = structure[0]
    except IndexError:
        model = structure

    # Hash things to keep
    keep_residues = set(keep_residues)
    keep_chains = set([x[0] for x in keep_residues])

    # Iterate over all residues and keep only the allowed ones
    # Use reverse to avoid index errors after deletions
    for i, chain in reversed(list(enumerate(model))):
        # Skip the whole chain if not using it
        if chain.name not in keep_chains:
            del model[i]
            continue
        for i, res in reversed(list(enumerate(chain))):
            if (chain.name, int(res.seqid.num)) not in keep_residues:
                del chain[i]

    structure.remove_empty_chains()
    # Save the modified structure to a new mmCIF or PDB file
    if file_type == "cif":
        structure.make_mmcif_document().write_file(str(out_path))
    else:
        structure.write_pdb(str(out_path))


def clean_mmcif(
    cif_path: Union[str, os.PathLike],
    output_path: Union[str, os.PathLike] = ".",
    file_type: str = "cif",
):
    """Remove non-RNA entities.

    :param cif_path: path to input cif
    :param output_path: path to cleaned structure.
    """
    # Load the MMCIF file
    structure = gemmi.read_structure(str(cif_path))
    rna_residues = ["A", "U", "C", "G"]

    chain_structure = gemmi.Structure()
    chain_structure.add_model(gemmi.Model(0))
    # Iterate over all models and chains
    paths = []
    for chain in structure[0]:
        if not any(res.name in rna_residues for res in chain):
            continue
        # Add the chain to the new structure
        chain_copy = chain.clone()
        chain_structure[0].add_chain(chain_copy)

    if file_type == "cif":
        with open(output_path, "w") as f:
            f.write(chain_structure.make_mmcif_document().as_string())
    else:
        with open(output_path, "w") as f:
            f.write(chain_structure.make_pdb_string())


def subset_mmcif_biopython(input_cif, allowed_residues, output_cif):
    from Bio.PDB import FastMMCIFParser, Select, PDBIO

    class ResidueSelector(Select):
        """Selector to filter residues by chain and residue ID."""

        def __init__(self, allowed_residues):
            super().__init__()
            self.allowed_residues = set(allowed_residues)  # List of (chain_id, res_id)

        def accept_residue(self, residue):
            chain_id = residue.parent.id  # Parent of residue is a chain
            res_id = residue.id[1]  # Residue ID is a tuple, use the numeric part
            return (chain_id, res_id) in self.allowed_residues

    # Parse the CIF file
    parser = FastMMCIFParser(QUIET=True)
    structure = parser.get_structure("structure", input_cif)
    io = PDBIO()
    io.set_structure(structure)
    io.save(output_cif, ResidueSelector(allowed_residues))


def split_mmcif_by_chain(
    cif_path: Union[str, os.PathLike],
    output_dir: Union[str, os.PathLike] = ".",
    prefix=None,
    min_length=0,
    max_length=1000,
):
    """Write one mmCIF for each chain in the input mmCIF.

    :param cif_path: path to input cif
    :param prefix: string to use as the name for new mmCIFs followed by chain ID. If None, use input file name.
    :param output_dir: Directory for dumping new mmCIFs.
    :param out_path: path to write new cif file

    :return: list of paths to new chain PDBs
    """
    if prefix is None:
        prefix = Path(cif_path).stem
    # Load the MMCIF file
    structure = gemmi.read_structure(str(cif_path))
    rna_residues = ["A", "U", "C", "G"]

    # Iterate over all models and chains
    paths = []
    for chain in structure[0]:
        if (len(chain) < min_length) or (len(chain) > max_length):
            continue
        if not any(res.name in rna_residues for res in chain):
            continue
        # Create a new structure for the chain
        chain_structure = gemmi.Structure()
        chain_structure.add_model(gemmi.Model(chain.name))

        # Add the chain to the new structure
        chain_copy = chain.clone()
        chain_structure[0].add_chain(chain_copy)

        # Output file name
        output_file = Path(output_dir) / f"{prefix}_{chain.name}.cif"
        # Write the chain to a new MMCIF file
        paths.append(output_file)
        with open(output_file, "w") as f:
            f.write(chain_structure.make_mmcif_document().as_string())
    return paths