#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""class for annotating junction reads
"""
from __future__ import annotations
from collections import defaultdict
from typing import Any
import gffutils
import numpy as np
from .utils import get_yaml
from .utils import timethis
# HardCode the information of chromosome because its name of two ref are not identical
CHROMS = get_yaml()["chr2hg38"]
[docs]class Annotator:
"""annotate junction reads
:param junctionmap: instance return :class:`ce_detector.detector.JunctionMap`
:type junctionmap: instance
:param database: database of annotation files
:type database: Any
:param output: filename of annotated junction reads. Defaults to None
:type output: TestIo
"""
def __init__(self, database: Any, output=None):
"""Constructor of Annotator"""
self.database = gffutils.FeatureDB(database)
self.output = output
[docs] @staticmethod
def detect_property(start, end, junction_list):
"""detect type of slice, number of skipped donors and number of skipped acceptors
type of slice including D A DA N NDA
:param start: start of junction read
:type start: int
:param end: end of junction read
:type end: int
:param junction_list: gene list of junction reads
:type junction_list: numpy.array
:return: type of slice, number of skipped donors, number of skipped acceptors
"""
known_donors = junction_list[:, 0]
known_acceptors = junction_list[:, 1]
donors_skipped = ((start + 1 < known_donors) & (known_donors < end)).sum()
acceptors_skipped = (
(start + 1 < known_acceptors) & (known_acceptors < end)
).sum()
if [start + 1, end] in junction_list.tolist():
reads_type = "DA"
elif (start + 1 in known_donors) and (end in known_acceptors):
reads_type = "NDA"
elif (start + 1 in known_donors) and (end not in known_acceptors):
reads_type = "D"
elif (start + 1 not in junction_list[:, 0]) and (end in junction_list[:, 1]):
reads_type = "A"
else:
reads_type = "N"
return reads_type, donors_skipped, acceptors_skipped
[docs] def annotate_junction(self, read, result, db):
"""annotate junction reads and write results to file
:param read: junction read return :class:`ce_detector.detector.Read`
:type read: instance
:param result: gene list used for annotation
:type result: defaultdict[Any, Any]
:param db: database of annotation file
:type db: instance of file
"""
chrom = read.chrom
start, end = read.start, read.end
region = f"{CHROMS[chrom]}:{start}-{end}" # change chromosome
gene_list = []
for gene in db.features_of_type(("gene"), limit=region):
gene_list.append(gene.id)
# new gene
if gene.id not in result:
# transcript
for transcript in db.children(
gene,
level=1,
featuretype=("primary_transcript", "transcript", "mRNA"),
): # mRNA may drop out
# find all position of introns for every gene known junctions
for junction in db.interfeatures(
db.children(
transcript,
level=1,
featuretype="exon",
order_by="start",
),
new_featuretype="intron",
):
result[gene.id].append([junction.start, junction.end])
if (
len(result[gene.id]) >= 1
): # drop gene with only one exon in terms of number of intron
result[gene.id] = np.unique(result[gene.id], axis=0)
else:
result.pop(gene.id)
gene_list.pop()
# annotate junctions reads
for gene in gene_list:
junction_list = result[gene]
reads_type, donors_skipped, acceptors_skipped = self.detect_property(
start,
end,
junction_list,
)
read.information.append(
[reads_type, donors_skipped, acceptors_skipped, gene],
)
if self.output:
self.output.write(
f"{read}\t{reads_type}\t{donors_skipped}\t{acceptors_skipped}\t{gene}\n",
)
[docs] @timethis(name="Junction Annotator", message=" ")
def run(self, junctionmap, logger, verbose=False):
"""main function used to annotate junction reads
pick all genes covered by one junction read and annotate all of them:
type of slice, number of skipped donors and number of skipped acceptors
"""
result = defaultdict(list)
# logger.info(f'Begin to annotate junctions!')
for _index, read in enumerate(junctionmap):
self.annotate_junction(read, result, self.database)
if verbose and _index % 1000 == 0:
logger.info(f"Chrom {junctionmap.chrom} {_index} Reads")
return junctionmap