Source code for rnaglib.transforms.annotate.rbp
import os
from typing import Union
from pathlib import Path
import networkx as nx
from Bio.PDB import NeighborSearch
from Bio.PDB.MMCIFParser import FastMMCIFParser
from rnaglib.transforms import AnnotationTransform
protein_residues = {
"ALA",
"ARG",
"ASN",
"ASP",
"CYS",
"GLN",
"GLU",
"GLY",
"HIS",
"ILE",
"LEU",
"LYS",
"MET",
"PHE",
"PRO",
"SER",
"THR",
"TRP",
"TYR",
"VAL",
}
phosphate_atoms = {"P", "OP1", "OP2"}
[docs]
class RBPTransform(AnnotationTransform):
"""
Adds information at the residue level about the protein content of the environment of the residue. Two types of annotations are added: a binary feature
indicating whether each residue is binding a protein according to a user-defined threshold, and features indicating the number of protein atoms in radiuses of
certain distances around each residue. Since it is computationally expensive, it is by default only used during the dataset creation. The protein content
annotation is further used to discard RNA residues with high protein content for certain tasks.
:param structures_dir: Path to the directory where the structures are stored (ex. as cif files)
:param float distance_threshold: the radius (in Angstrom) of the zone considered as the environment of the residue (default 5.0)
:param bool protein_number_annotations: whether to add annotations regarding the number of protein atoms around each residue besides the binary annotation (default False)
:param list[float] distances: the list of the radiuses (in Angstroms) of the balls centered around the residues within which we would like to compute the number of protein atoms
:return: the annotated graph, actually the graph is mutated in place
"""
[docs]
def __init__(self, structures_dir: Union[os.PathLike, str], distance_threshold: float = 5.0, protein_number_annotations: bool = False, distances: list[float] = [0.5]):
self.structures_dir = structures_dir
self.distance_threshold = distance_threshold
self.protein_number_annotations = protein_number_annotations
self.distances = distances
pass
def forward(self, rna_dict: dict) -> dict:
"""Application of the transform to an RNA dictionary object
:param dict rna_dict: the RNA dictionary which has to be annotated with protein content information
:return: the annotated version of rna_dict
:rtype: dict
"""
# Load the structure
g = rna_dict["rna"]
cif = str(Path(self.structures_dir) / f"{g.graph['pdbid'].lower()}.cif")
parser = FastMMCIFParser(QUIET=True)
structure = parser.get_structure("", cif)
# set of tuples (chain, pos) for RNA nodes
rna_res_ids = set([tuple(n.split(".")[1:]) for n in g.nodes()])
# Extract atoms for RNA and Protein
rna_atoms = []
protein_atoms = []
rna_residues = []
for chain in structure[0]:
for residue in chain:
res_name = residue.get_resname()
if (chain.id, str(residue.id[1])) in rna_res_ids:
for atom in residue.get_atoms():
if atom.get_name() in phosphate_atoms:
rna_atoms.append(atom)
break
rna_residues.append(residue)
if residue.get_resname() in protein_residues:
for atom in residue.get_atoms():
if atom.get_name() == "CA":
protein_atoms.append(atom)
break
# Build a KDTree
all_rna_atoms = list(rna_atoms)
all_protein_atoms = list(protein_atoms)
close_residues = set()
if self.protein_number_annotations:
protein_numbers_list = [{} for _ in self.distances]
protein_proximity = len(all_protein_atoms) > 1
if protein_proximity:
neighbor_search = NeighborSearch(all_protein_atoms)
# Find RNA residues near protein residues
distance_threshold = self.distance_threshold
for rna_atom in all_rna_atoms:
close_atoms = neighbor_search.search(rna_atom.coord, distance_threshold)
if len(close_atoms) > 0:
rna_residue = rna_atom.get_parent()
close_residues.add((rna_residue.get_parent().id, rna_residue.id[1]))
if self.protein_number_annotations:
for i, current_distance_threshold in enumerate(self.distances):
close_atoms = neighbor_search.search(rna_atom.coord, current_distance_threshold)
rna_residue = rna_atom.get_parent()
protein_numbers_list[i][(rna_residue.get_parent().id, rna_residue.id[1])] = len(close_atoms)
# Output the results
rbp_status = {}
protein_numbers = {}
for node in g.nodes():
chain, pos = node.split(".")[1:]
rbp_status[node] = (chain, int(pos)) in close_residues
if self.protein_number_annotations:
node_protein_numbers_list = []
for i in range(len(self.distances)):
if protein_proximity:
try:
node_protein_numbers_list.append(protein_numbers_list[i][(chain,int(pos))])
except:
node_protein_numbers_list.append(0)
else:
node_protein_numbers_list.append(0)
protein_numbers[node] = node_protein_numbers_list
nx.set_node_attributes(g, rbp_status, "protein_binding")
if self.protein_number_annotations:
nx.set_node_attributes(g, protein_numbers, "protein_content")
for i, distance in enumerate(self.distances):
protein_numbers_distance = {node: protein_numbers[node][i] for node in protein_numbers}
nx.set_node_attributes(g, protein_numbers_distance, "protein_content_"+str(distance))
return rna_dict