Annotation하고자 하는 interval들을 interval tree에 저장 후 annotation BED 파일을 순회하면서 tree에 query를 던진다.
from collections import defaultdict

import pandas as pd
from intervaltree import IntervalTree
# Load the intervals to annotate and the annotation records.
data = pd.read_csv(INTERVAL_FILE)  # Requires chrom, start, end columns.
# NOTE(review): read with default CSV settings, so this presumably also needs a
# header row naming chrom/start/end -- confirm for raw tab-separated BED input.
bed = pd.read_csv(ANNOTATION_FILE)

# One interval tree per chromosome. Each stored interval carries the positional
# row index of `data` as its payload so a hit can be mapped back to its row.
intervals = defaultdict(IntervalTree)
for i, r in enumerate(data.to_records()):
    intervals[r.chrom].addi(r.start, r.end, i)

# Annotation mask: mask[i] becomes True once row i of `data` is matched.
mask = [False] * len(data)
for r in bed.to_records():
    # Look the tree up once per annotation record instead of once per query.
    tree = intervals[r.chrom]

    # The two queries below are alternatives -- keep the one you need. The
    # overlap query returns a superset of the envelop query, so running both
    # yields the same mask as running the overlap query alone.

    # Envelop query: intervals lying completely inside [r.start, r.end).
    for _start, _end, idx in tree.envelop(r.start, r.end):
        mask[idx] = True

    # Or, overlap query: intervals sharing any position with [r.start, r.end).
    # The slice form is tree[begin:end].
    for _start, _end, idx in tree[r.start:r.end]:
        mask[idx] = True
Python
복사