cool_seq_tool.sources.uta_database#
Provide transcript lookup and metadata tools via the UTA database.
- class cool_seq_tool.sources.uta_database.DbConnectionArgs(**data)[source]#
Represent database connection arguments
- class cool_seq_tool.sources.uta_database.GenomicAlnData(**data)[source]#
Represent genomic alignment data from UTA tx_exon_aln_v view
- class cool_seq_tool.sources.uta_database.ParseResult(pr)[source]#
Subclass of urllib.parse.ParseResult that adds database and schema methods, and provides stringification. Source: https://github.com/biocommons/hgvs
- class cool_seq_tool.sources.uta_database.TxExonAlnData(**data)[source]#
Represent data from UTA tx_exon_aln_v view
- class cool_seq_tool.sources.uta_database.UtaDatabase(db_url=UTA_DB_URL)[source]#
Provide transcript lookup and metadata tools via the Universal Transcript Archive (UTA) database.
Users should use the create() method to construct a new instance. Note that almost all public methods are defined as async – see the Usage section for more information.
>>> import asyncio
>>> from cool_seq_tool.sources.uta_database import UtaDatabase
>>> uta_db = asyncio.run(UtaDatabase.create())
- __init__(db_url=UTA_DB_URL)[source]#
Initialize DB class. Should only be used by the create() method, and not be called directly by a user.
- Parameters:
db_url (str) – PostgreSQL connection URL. Format: driver://user:password@host/database/schema
- async classmethod create(db_url=UTA_DB_URL)[source]#
Manufacture a fully-initialized class instance (a la factory pattern). This method should be used instead of calling the class directly to create a new instance.
>>> import asyncio
>>> from cool_seq_tool.sources.uta_database import UtaDatabase
>>> uta_db = asyncio.run(UtaDatabase.create())
- Parameters:
cls – supplied implicitly
db_url (str) – PostgreSQL connection URL. Format: driver://user:password@host/database/schema
- Return type:
TypeVar(UTADatabaseType, bound=UtaDatabase)
- Returns:
UTA DB access class instance
- static data_from_result(result)[source]#
Return data found from result.
- Parameters:
result (TxExonAlnData) – Transcript exon alignment data
- Return type:
Optional[GenomicTxData]
- Returns:
Aligned genomic / transcript exon data
- async execute_query(query)[source]#
Execute a query and return its result.
- Parameters:
query (str) – Query to make on database
- Return type:
Any
- Returns:
Query’s result
- async gene_exists(gene)[source]#
Return whether or not a gene symbol exists in UTA gene table.
- Parameters:
gene (str) – Gene symbol
- Return type:
bool
- Returns:
True if gene symbol exists in UTA, False if not
- async get_ac_descr(ac)[source]#
Return accession description. This is typically available only for accessions from older (pre-GRCh38) builds.
>>> import asyncio
>>> from cool_seq_tool.sources.uta_database import UtaDatabase
>>> async def describe():
...     uta_db = await UtaDatabase.create()
...     result = await uta_db.get_ac_descr("NC_000001.10")
...     return result
>>> asyncio.run(describe())
'Homo sapiens chromosome 1, GRCh37.p13 Primary Assembly'
- Parameters:
ac (str) – chromosome accession, e.g. "NC_000001.10"
- Return type:
Optional[str]
- Returns:
Description containing assembly and chromosome
- async get_ac_from_gene(gene)[source]#
Return genomic accession(s) associated to a gene.
- Parameters:
gene (str) – Gene symbol
- Return type:
list[str]
- Returns:
List of genomic accessions, sorted in desc order
- async get_alt_ac_start_or_end(tx_ac, tx_exon_start, tx_exon_end, gene)[source]#
Get genomic data for related transcript exon start or end.
- Parameters:
tx_ac (str) – Transcript accession
tx_exon_start (int) – Transcript’s exon start coordinate
tx_exon_end (int) – Transcript’s exon end coordinate
gene (Optional[str]) – HGNC gene symbol
- Return type:
tuple[Optional[GenomicAlnData], Optional[str]]
- Returns:
Genomic alignment data and warnings if found
- async get_cds_start_end(tx_ac)[source]#
Get coding start and end site
- Parameters:
tx_ac (str) – Transcript accession
- Return type:
Optional[tuple[int, int]]
- Returns:
[Coding start site, Coding end site]
- async get_chr_assembly(ac)[source]#
Get chromosome and assembly for NC accession if not in GRCh38.
>>> import asyncio
>>> from cool_seq_tool.sources.uta_database import UtaDatabase
>>> uta_db = asyncio.run(UtaDatabase.create())
>>> result = asyncio.run(uta_db.get_chr_assembly("NC_000007.13"))
>>> result
('chr7', <Assembly.GRCH37: 'GRCh37'>)
Returns None if unable to find (either unrecognized/invalid, or a GRCh38 accession).
- Parameters:
ac (str) – RefSeq NC accession, e.g. "NC_000007.13"
- Return type:
Optional[tuple[str, Assembly]]
- Returns:
Chromosome and assembly that accession is on, if available.
- async get_gene_from_ac(ac, start_pos, end_pos)[source]#
Get gene(s) within the provided coordinate range
>>> import asyncio
>>> from cool_seq_tool.sources import UtaDatabase
>>> async def get_gene():
...     uta_db = await UtaDatabase.create()
...     result = await uta_db.get_gene_from_ac(
...         "NC_000017.11", 43044296, 43045802
...     )
...     return result
>>> asyncio.run(get_gene())
['BRCA1']
- Parameters:
ac (str) – NC accession, e.g. "NC_000001.11"
start_pos (int) – Start position change
end_pos (int) – End position change
- Return type:
Optional[list[str]]
- Returns:
List of HGNC gene symbols
- async get_genomic_tx_data(tx_ac, pos, annotation_layer=AnnotationLayer.CDNA, alt_ac=None, target_genome_assembly=Assembly.GRCH38)[source]#
Get transcript mapping to genomic data.
- Parameters:
tx_ac (str) – Accession on c. coordinate
pos (tuple[int, int]) – (start pos, end pos). These must describe the inter-residue coordinates that are being examined.
annotation_layer (Union[Literal[<AnnotationLayer.CDNA: 'c'>], Literal[<AnnotationLayer.GENOMIC: 'g'>]]) – Annotation layer for ac and pos
alt_ac (Optional[str]) – Accession on g. coordinate
target_genome_assembly (Assembly) – Genome assembly to get genomic data for. If alt_ac is provided, it will return the associated assembly.
- Return type:
Optional[GenomicTxMetadata]
- Returns:
Metadata for genomic and transcript accessions
- async get_mane_c_genomic_data(ac, alt_ac, start_pos, end_pos)[source]#
Get MANE transcript and genomic data. Used when going from g. to MANE c. representation. This function parses queried data from the tx_exon_aln_v table, and sorts the queried data by the most recent genomic build
>>> import asyncio
>>> from cool_seq_tool.sources import UtaDatabase
>>> async def get_braf_mane():
...     uta_db = await UtaDatabase.create()
...     result = await uta_db.get_mane_c_genomic_data(
...         "NM_004333.6",
...         None,
...         140753335,
...         140753335,
...     )
...     return result
>>> braf = asyncio.run(get_braf_mane())
>>> braf["alt_ac"]
'NC_000007.14'
- Parameters:
ac (str) – MANE transcript accession
alt_ac (Optional[str]) – NC accession. Used to triangulate on correct genomic data. Can be set to None if unavailable.
start_pos (int) – Genomic start position
end_pos (int) – Genomic end position change
- Return type:
Optional[GenomicTxMetadata]
- Returns:
Metadata for MANE genomic and transcript accessions results if successful
- async get_newest_assembly_ac(ac)[source]#
Find accession associated to latest genomic assembly
- Parameters:
ac (str) – Accession
- Return type:
list[str]
- Returns:
List of accessions associated to latest genomic assembly. Order by desc
- static get_secret()[source]#
Get secrets for UTA DB instances. Used for deployment on AWS.
- Raises:
ClientError – If unable to retrieve secret value due to decryption failure, internal service error, invalid parameter, invalid request, or resource not found.
- Return type:
str
- async get_transcripts(start_pos=None, end_pos=None, gene=None, use_tx_pos=True, alt_ac=None)[source]#
Get transcripts for a given gene or alt_ac related to optional positions.
- Parameters:
start_pos (Optional[int]) – Start position change. If not provided and end_pos not provided, all transcripts associated with the gene and/or accession will be returned
end_pos (Optional[int]) – End position change. If not provided and start_pos not provided, all transcripts associated with the gene and/or accession will be returned
gene (Optional[str]) – HGNC gene symbol
use_tx_pos (bool) – True if querying on transcript position. This means start_pos and end_pos are c. coordinate positions. False if querying on genomic position. This means start_pos and end_pos are g. coordinate positions
alt_ac (Optional[str]) – Genomic accession. If not provided, must provide gene
- Return type:
DataFrame
- Returns:
Data Frame containing transcripts associated with a gene. Transcripts are ordered by most recent NC accession, then by descending transcript length
- async get_transcripts_from_genomic_pos(alt_ac, g_pos)[source]#
Get transcripts associated to a genomic ac and position.
- Parameters:
alt_ac (str) – Genomic accession
g_pos (int) – Genomic position
- Return type:
list[str]
- Returns:
RefSeq transcripts on c. coordinate
- async get_tx_exon_aln_v_data(tx_ac, start_pos, end_pos, alt_ac=None, use_tx_pos=True, like_tx_ac=False)[source]#
Return queried data from tx_exon_aln_v table.
- Parameters:
tx_ac (str) – accession on c. coordinate
start_pos (int) – Start position change
end_pos (int) – End position change
alt_ac (Optional[str]) – accession on g. coordinate
use_tx_pos (bool) – True if querying on transcript position. This means start_pos and end_pos are on the c. coordinate. False if querying on genomic position. This means start_pos and end_pos are on the g. coordinate
like_tx_ac (bool) – True if tx_ac condition should be a like statement. This is used when you want to query an accession regardless of its version. False if tx_condition will be exact match
- Return type:
list[TxExonAlnData]
- Returns:
List of transcript exon alignment data
- async p_to_c_ac(p_ac)[source]#
Return cDNA reference sequence accession from protein reference sequence accession (i.e. p. to c. in HGVS syntax)
- Parameters:
p_ac (str) – Protein accession
- Return type:
list[str]
- Returns:
List of rows containing c. accessions that are associated with the given p. accession. In ascending order.