cool_seq_tool.sources.uta_database#
Provide transcript lookup and metadata tools via the UTA database.
- class cool_seq_tool.sources.uta_database.ParseResult(pr)[source]#
Subclass of url.ParseResult that adds database and schema methods, and provides stringification. Source: https://github.com/biocommons/hgvs
- class cool_seq_tool.sources.uta_database.UtaDatabase(db_url=UTA_DB_URL, chain_file_37_to_38=None, chain_file_38_to_37=None)[source]#
Provide transcript lookup and metadata tools via the Universal Transcript Archive (UTA) database.
Users should use the `create()` method to construct a new instance. Note that almost all public methods are defined as `async` – see the Usage section for more information.
>>> import asyncio
>>> from cool_seq_tool.sources.uta_database import UtaDatabase
>>> uta_db = asyncio.run(UtaDatabase.create())
- __init__(db_url=UTA_DB_URL, chain_file_37_to_38=None, chain_file_38_to_37=None)[source]#
Initialize DB class. Should only be used by the `create()` method, and not be called directly by a user.
- Parameters:
db_url (str) – PostgreSQL connection URL. Format: `driver://user:password@host/database/schema`
chain_file_37_to_38 (Optional[str]) – Optional path to chain file for 37 to 38 assembly. This is used for `agct`. If this is not provided, will check to see if the `LIFTOVER_CHAIN_37_TO_38` env var is set. If neither is provided, will allow `agct` to download a chain file from UCSC.
chain_file_38_to_37 (Optional[str]) – Optional path to chain file for 38 to 37 assembly. This is used for `agct`. If this is not provided, will check to see if the `LIFTOVER_CHAIN_38_TO_37` env var is set. If neither is provided, will allow `agct` to download a chain file from UCSC.
- async classmethod create(db_url=UTA_DB_URL)[source]#
Manufacture a fully-initialized class instance (a la factory pattern). This method should be used instead of calling the class directly to create a new instance.
>>> import asyncio >>> from cool_seq_tool.sources.uta_database import UtaDatabase >>> uta_db = asyncio.run(UtaDatabase.create())
- Parameters:
cls – supplied implicitly
db_url (str) – PostgreSQL connection URL. Format: `driver://user:password@host/database/schema`
- Return type:
TypeVar(UTADatabaseType, bound=UtaDatabase)
- Returns:
UTA DB access class instance
- static data_from_result(result)[source]#
Return data found from result.
- Parameters:
result (
List
) – Data from tx_exon_aln_v table- Return type:
Optional
[Dict
]- Returns:
Gene, strand, and position ranges for tx and alt_ac
- async execute_query(query)[source]#
Execute a query and return its result.
- Parameters:
query (
str
) – Query to make on database- Return type:
Any
- Returns:
Query’s result
- async get_ac_descr(ac)[source]#
Return accession description. This is typically available only for accessions from older (pre-GRCh38) builds.
>>> import asyncio
>>> from cool_seq_tool.sources.uta_database import UtaDatabase
>>> async def describe():
...     uta_db = await UtaDatabase.create()
...     result = await uta_db.get_ac_descr("NC_000001.10")
...     return result
>>> asyncio.run(describe())
'Homo sapiens chromosome 1, GRCh37.p13 Primary Assembly'
- Parameters:
ac (str) – chromosome accession, e.g. "NC_000001.10"
- Return type:
Optional
[str
]- Returns:
Description containing assembly and chromosome
- async get_ac_from_gene(gene)[source]#
Return genomic accession(s) associated to a gene.
- Parameters:
gene (
str
) – Gene symbol- Return type:
List
[str
]- Returns:
List of genomic accessions, sorted in desc order
- async get_alt_ac_start_or_end(tx_ac, tx_exon_start, tx_exon_end, gene)[source]#
Get genomic data for related transcript exon start or end.
- Parameters:
tx_ac (str) – Transcript accession
tx_exon_start (int) – Transcript’s exon start coordinate
tx_exon_end (int) – Transcript’s exon end coordinate
gene (Optional[str]) – HGNC gene symbol
- Return type:
Tuple[Optional[Tuple[str, str, int, int, int]], Optional[str]]
- Returns:
[hgnc symbol, genomic accession for chromosome, aligned genomic start coordinate, aligned genomic end coordinate, strand], and warnings if found
- async get_cds_start_end(tx_ac)[source]#
Get coding start and end site
- Parameters:
tx_ac (
str
) – Transcript accession- Return type:
Optional
[Tuple
[int
,int
]]- Returns:
[Coding start site, Coding end site]
- async get_chr_assembly(ac)[source]#
Get chromosome and assembly for NC accession if not in GRCh38.
- Parameters:
ac (
str
) – NC accession- Return type:
Optional
[Tuple
[str
,str
]]- Returns:
Chromosome and assembly that the accession is on
- async get_gene_from_ac(ac, start_pos, end_pos)[source]#
Get gene(s) within the provided coordinate range
>>> import asyncio
>>> from cool_seq_tool.sources import UtaDatabase
>>> async def get_gene():
...     uta_db = await UtaDatabase.create()
...     result = await uta_db.get_gene_from_ac("NC_000017.11", 43044296, 43045802)
...     return result
>>> asyncio.run(get_gene())
['BRCA1']
- Parameters:
ac (str) – NC accession, e.g. "NC_000001.11"
start_pos (int) – Start position change
end_pos (int) – End position change
- Return type:
Optional
[List
[str
]]- Returns:
List of HGNC gene symbols
- async get_genes_and_alt_acs(pos, strand=None, chromosome=None, alt_ac=None, gene=None)[source]#
Return genes and genomic accessions for a position on a chromosome or alt_ac
- Parameters:
pos (int) – Genomic position
strand (Optional[Strand]) – Strand
chromosome (Optional[int]) – Chromosome. Must give chromosome without a prefix (i.e. `1` or `X`). If not provided, must provide `alt_ac`. If `alt_ac` is also provided, `alt_ac` will be used.
alt_ac (Optional[str]) – Genomic accession (i.e. `NC_000001.11`). If not provided, must provide `chromosome`. If `chromosome` is also provided, `alt_ac` will be used.
gene (Optional[str]) – Gene symbol
- Return type:
Tuple
[Optional
[Dict
],Optional
[str
]]- Returns:
Dictionary containing genes and genomic accessions and warnings if found
- async get_genomic_tx_data(tx_ac, pos, annotation_layer=AnnotationLayer.CDNA, alt_ac=None, target_genome_assembly=Assembly.GRCH38)[source]#
Get transcript mapping to genomic data.
- Parameters:
tx_ac (str) – Accession on c. coordinate
pos (Tuple[int, int]) – (start pos, end pos)
annotation_layer (Union[c, g]) – Annotation layer for `tx_ac` and `pos`
alt_ac (Optional[str]) – Accession on g. coordinate
target_genome_assembly (Assembly) – Genome assembly to get genomic data for. If `alt_ac` is provided, it will return the associated assembly.
- Return type:
Optional
[Dict
]- Returns:
Gene, Transcript accession and position change, Altered transcript accession and position change, Strand
- get_liftover(chromosome, pos, liftover_to_assembly)[source]#
Get new genome assembly data for a position on a chromosome.
- Parameters:
chromosome (str) – The chromosome number. Must be prefixed with `chr`
pos (int) – Position on the chromosome
liftover_to_assembly (Assembly) – Assembly to liftover to
- Return type:
Optional
[Tuple
[str
,int
]]- Returns:
Target chromosome and target position for assembly
- async get_mane_c_genomic_data(ac, alt_ac, start_pos, end_pos)[source]#
Get MANE transcript and genomic data. Used when going from g. to MANE c. representation.
>>> import asyncio
>>> from cool_seq_tool.sources import UtaDatabase
>>> async def get_braf_mane():
...     uta_db = await UtaDatabase.create()
...     result = await uta_db.get_mane_c_genomic_data(
...         "NM_004333.6",
...         None,
...         140753335,
...         140753335,
...     )
...     return result
>>> braf = asyncio.run(get_braf_mane())
>>> braf["alt_ac"]
'NC_000007.14'
- Parameters:
ac (str) – MANE transcript accession
alt_ac (Optional[str]) – NC accession. Used to triangulate on correct genomic data. Can be set to `None` if unavailable.
start_pos (int) – Genomic start position
end_pos (int) – Genomic end position change
- Return type:
Optional
[Dict
]- Returns:
MANE transcript results if successful
- async get_newest_assembly_ac(ac)[source]#
Find accession associated to latest genomic assembly
- Parameters:
ac (
str
) – Accession- Return type:
List
[str
]- Returns:
List of accessions associated to latest genomic assembly. Order by desc
- static get_secret()[source]#
Get secrets for UTA DB instances. Used for deployment on AWS.
- Return type:
str
- async get_transcripts(start_pos=None, end_pos=None, gene=None, use_tx_pos=True, alt_ac=None)[source]#
Get transcripts for a given `gene` or `alt_ac` related to optional positions.
- Parameters:
start_pos (Optional[int]) – Start position change. If not provided and `end_pos` not provided, all transcripts associated with the gene and/or accession will be returned
end_pos (Optional[int]) – End position change. If not provided and `start_pos` not provided, all transcripts associated with the gene and/or accession will be returned
gene (Optional[str]) – HGNC gene symbol
use_tx_pos (bool) – `True` if querying on transcript position. This means `start_pos` and `end_pos` are c. coordinate positions. `False` if querying on genomic position. This means `start_pos` and `end_pos` are g. coordinate positions
alt_ac (Optional[str]) – Genomic accession. If not provided, must provide `gene`
- Return type:
DataFrame
- Returns:
Data Frame containing transcripts associated with a gene. Transcripts are ordered by most recent NC accession, then by descending transcript length
- async get_transcripts_from_genomic_pos(alt_ac, g_pos)[source]#
Get transcripts associated to a genomic ac and position.
- Parameters:
alt_ac (
str
) – Genomic accessiong_pos (
int
) – Genomic position
- Return type:
List
[str
]- Returns:
RefSeq transcripts on c. coordinate
- async get_tx_exon_aln_v_data(tx_ac, start_pos, end_pos, alt_ac=None, use_tx_pos=True, like_tx_ac=False)[source]#
Return queried data from tx_exon_aln_v table.
- Parameters:
tx_ac (str) – accession on c. coordinate
start_pos (int) – Start position change
end_pos (int) – End position change
alt_ac (Optional[str]) – accession on g. coordinate
use_tx_pos (bool) – `True` if querying on transcript position. This means `start_pos` and `end_pos` are on the c. coordinate. `False` if querying on genomic position. This means `start_pos` and `end_pos` are on the g. coordinate
like_tx_ac (bool) – `True` if tx_ac condition should be a like statement. This is used when you want to query an accession regardless of its version. `False` if tx_condition will be exact match
- Return type:
List
- Returns:
List of tx_exon_aln_v data
- async get_tx_exons(tx_ac, alt_ac=None)[source]#
Get list of transcript exons start/end coordinates.
- Parameters:
tx_ac (
str
) – Transcript accessionalt_ac (
Optional
[str
]) – Genomic accession
- Return type:
Tuple
[Optional
[List
[Tuple
[int
,int
]]],Optional
[str
]]- Returns:
List of a transcript’s exon start/end coordinates, and warnings if found
- async get_tx_exons_genomic_coords(tx_ac, alt_ac)[source]#
Get exon number, transcript coordinates, and genomic coordinates
- Parameters:
tx_ac (
str
) – Transcript accessionalt_ac (
str
) – RefSeq genomic accession
- Return type:
Tuple
[Optional
[Tuple
[int
,int
,int
,int
,int
]],Optional
[str
]]- Returns:
Tuple of exon numbers, transcript and genomic coordinates, and warnings if found
- async liftover_to_38(genomic_tx_data)[source]#
Liftover genomic_tx_data to hg38 assembly.
- Parameters:
genomic_tx_data (
Dict
) – Dictionary containing gene, nc_accession, alt_pos, and strand- Return type:
None
- async p_to_c_ac(p_ac)[source]#
Return cDNA reference sequence accession from protein reference sequence accession (i.e. `p.` to `c.` in HGVS syntax)
- Parameters:
p_ac (
str
) – Protein accession- Return type:
List
[str
]- Returns:
List of rows containing c. accessions that are associated with the given p. accession. In ascending order.