Source code for rnaglib.transforms.annotate.rbp

import os
from typing import Union
from pathlib import Path

import networkx as nx
from Bio.PDB import NeighborSearch
from Bio.PDB.MMCIFParser import FastMMCIFParser

from rnaglib.transforms import AnnotationTransform

protein_residues = {
    "ALA",
    "ARG",
    "ASN",
    "ASP",
    "CYS",
    "GLN",
    "GLU",
    "GLY",
    "HIS",
    "ILE",
    "LEU",
    "LYS",
    "MET",
    "PHE",
    "PRO",
    "SER",
    "THR",
    "TRP",
    "TYR",
    "VAL",
}

phosphate_atoms = {"P", "OP1", "OP2"}



[docs]
class RBPTransform(AnnotationTransform):
    """
    Adds information at the residue level about the protein content of the environment of the residue. Two types of annotations are added: a binary feature 
    indicating whether each residue is binding a protein according to a user-defined threshold, and features indicating the number of protein atoms in radiuses of
    certain distances around each residue. Since it is computationally expensive, it is by default only used during the dataset creation. The protein content
    annotation is further used to discard RNA residues with high protein content for certain tasks.

    :param structures_dir: Path to the directory where the structures are stored (ex. as cif files)
    :param float distance_threshold: the radius (in Angstrom) of the zone considered as the environment of the residue (default 5.0)
    :param bool protein_number_annotations: whether to add annotations regarding the number of protein atoms around each residue besides the binary annotation (default False)
    :param list[float] distances: the list of the radiuses (in Angstroms) of the balls centered around the residues within which we would like to compute the number of protein atoms
    :return: the annotated graph, actually the graph is mutated in place
    """


[docs]
    def __init__(self, structures_dir: Union[os.PathLike, str], distance_threshold: float = 5.0, protein_number_annotations: bool = False, distances: list[float] = [0.5]):
        self.structures_dir = structures_dir
        self.distance_threshold = distance_threshold
        self.protein_number_annotations = protein_number_annotations
        self.distances = distances
        pass


    def forward(self, rna_dict: dict) -> dict:
        """Application of the transform to an RNA dictionary object

        :param dict rna_dict: the RNA dictionary which has to be annotated with protein content information
        :return: the annotated version of rna_dict
        :rtype: dict
        """
        # Load the structure
        g = rna_dict["rna"]
        cif = str(Path(self.structures_dir) / f"{g.graph['pdbid'].lower()}.cif")
        parser = FastMMCIFParser(QUIET=True)
        structure = parser.get_structure("", cif)

        # set of tuples (chain, pos) for RNA nodes
        rna_res_ids = set([tuple(n.split(".")[1:]) for n in g.nodes()])

        # Extract atoms for RNA and Protein
        rna_atoms = []
        protein_atoms = []
        rna_residues = []

        for chain in structure[0]:
            for residue in chain:
                res_name = residue.get_resname()
                if (chain.id, str(residue.id[1])) in rna_res_ids:
                    for atom in residue.get_atoms():
                        if atom.get_name() in phosphate_atoms:
                            rna_atoms.append(atom)
                            break
                    rna_residues.append(residue)
                if residue.get_resname() in protein_residues:
                    for atom in residue.get_atoms():
                        if atom.get_name() == "CA":
                            protein_atoms.append(atom)
                            break

        # Build a KDTree
        all_rna_atoms = list(rna_atoms)
        all_protein_atoms = list(protein_atoms)

        close_residues = set()

        if self.protein_number_annotations:
            protein_numbers_list = [{} for _ in self.distances]

        protein_proximity = len(all_protein_atoms) > 1

        if protein_proximity:
            neighbor_search = NeighborSearch(all_protein_atoms)

            # Find RNA residues near protein residues
            distance_threshold = self.distance_threshold

            for rna_atom in all_rna_atoms:
                close_atoms = neighbor_search.search(rna_atom.coord, distance_threshold)
                if len(close_atoms) > 0:
                    rna_residue = rna_atom.get_parent()
                    close_residues.add((rna_residue.get_parent().id, rna_residue.id[1]))
                if self.protein_number_annotations:
                    for i, current_distance_threshold in enumerate(self.distances):
                        close_atoms = neighbor_search.search(rna_atom.coord, current_distance_threshold)
                        rna_residue = rna_atom.get_parent()
                        protein_numbers_list[i][(rna_residue.get_parent().id, rna_residue.id[1])] = len(close_atoms)


        # Output the results
        rbp_status = {}
        protein_numbers = {}
        for node in g.nodes():
            chain, pos = node.split(".")[1:]
            rbp_status[node] = (chain, int(pos)) in close_residues
            if self.protein_number_annotations:
                node_protein_numbers_list = []
                for i in range(len(self.distances)):
                    if protein_proximity:
                        try:
                            node_protein_numbers_list.append(protein_numbers_list[i][(chain,int(pos))])
                        except:
                            node_protein_numbers_list.append(0)
                    else:
                        node_protein_numbers_list.append(0)
                protein_numbers[node] = node_protein_numbers_list

        nx.set_node_attributes(g, rbp_status, "protein_binding")
        if self.protein_number_annotations:
            nx.set_node_attributes(g, protein_numbers, "protein_content")
            for i, distance in enumerate(self.distances):
                protein_numbers_distance = {node: protein_numbers[node][i] for node in protein_numbers}
                nx.set_node_attributes(g, protein_numbers_distance, "protein_content_"+str(distance))
        return rna_dict