cool_seq_tool.sources.uta_database#

Provide transcript lookup and metadata tools via the UTA database.

class cool_seq_tool.sources.uta_database.DbConnectionArgs(**data)[source]#

Represent database connection arguments

database: str[source]#
host: str[source]#
password: str[source]#
port: int[source]#
user: str[source]#
class cool_seq_tool.sources.uta_database.GenomicAlnData(**data)[source]#

Represent genomic alignment data from UTA tx_exon_aln_v view

alt_ac: Annotated[str][source]#
alt_end_i: Annotated[int][source]#
alt_start_i: Annotated[int][source]#
alt_strand: Strand[source]#
hgnc: Annotated[str][source]#
ord: Annotated[int][source]#
class cool_seq_tool.sources.uta_database.ParseResult(pr)[source]#

Subclass of urllib.parse.ParseResult that adds database and schema properties, and provides stringification. Source: https://github.com/biocommons/hgvs

property database: str | None[source]#

Create database property.

property sanitized_url: str[source]#

Sanitized DB URL with the password masked

property schema: str | None[source]#

Create schema property.

class cool_seq_tool.sources.uta_database.TxExonAlnData(**data)[source]#

Represent data from UTA tx_exon_aln_v view

alt_aln_method: Annotated[str][source]#
alt_exon_id: Annotated[int][source]#
tx_ac: Annotated[str][source]#
tx_end_i: Annotated[int][source]#
tx_exon_id: Annotated[int][source]#
tx_start_i: Annotated[int][source]#
class cool_seq_tool.sources.uta_database.UtaDatabase(db_url=UTA_DB_URL)[source]#

Provide transcript lookup and metadata tools via the Universal Transcript Archive (UTA) database.

Users should use the create() method to construct a new instance. Note that almost all public methods are defined as async – see the Usage section for more information.

>>> import asyncio
>>> from cool_seq_tool.sources.uta_database import UtaDatabase
>>> uta_db = asyncio.run(UtaDatabase.create())
__init__(db_url=UTA_DB_URL)[source]#

Initialize DB class. Should only be used by create() method, and not be called directly by a user.

Parameters:

db_url (str) – PostgreSQL connection URL. Format: driver://user:password@host/database/schema

async classmethod create(db_url=UTA_DB_URL)[source]#

Manufacture a fully-initialized class instance (a la factory pattern). This method should be used instead of calling the class directly to create a new instance.

>>> import asyncio
>>> from cool_seq_tool.sources.uta_database import UtaDatabase
>>> uta_db = asyncio.run(UtaDatabase.create())
Parameters:
  • cls – supplied implicitly

  • db_url (str) – PostgreSQL connection URL. Format: driver://user:password@host/database/schema

Return type:

TypeVar(UTADatabaseType, bound= UtaDatabase)

Returns:

UTA DB access class instance

async create_pool()[source]#

Create connection pool if not already created.

Return type:

None

static data_from_result(result)[source]#

Return data found from result.

Parameters:

result (TxExonAlnData) – Transcript exon alignment data

Return type:

Optional[GenomicTxData]

Returns:

Aligned genomic / transcript exon data

async execute_query(query)[source]#

Execute a query and return its result.

Parameters:

query (str) – Query to make on database

Return type:

Any

Returns:

Query’s result

async gene_exists(gene)[source]#

Return whether or not a gene symbol exists in UTA gene table.

Parameters:

gene (str) – Gene symbol

Return type:

bool

Returns:

True if gene symbol exists in UTA, False if not

async get_ac_descr(ac)[source]#

Return accession description. This is typically available only for accessions from older (pre-GRCh38) builds.

>>> import asyncio
>>> from cool_seq_tool.sources.uta_database import UtaDatabase
>>> async def describe():
...     uta_db = await UtaDatabase.create()
...     result = await uta_db.get_ac_descr("NC_000001.10")
...     return result
>>> asyncio.run(describe())
'Homo sapiens chromosome 1, GRCh37.p13 Primary Assembly'
Parameters:

ac (str) – chromosome accession, e.g. "NC_000001.10"

Return type:

Optional[str]

Returns:

Description containing assembly and chromosome

async get_ac_from_gene(gene)[source]#

Return genomic accession(s) associated to a gene.

Parameters:

gene (str) – Gene symbol

Return type:

list[str]

Returns:

List of genomic accessions, sorted in descending order

async get_alt_ac_start_or_end(tx_ac, tx_exon_start, tx_exon_end, gene)[source]#

Get genomic data for related transcript exon start or end.

Parameters:
  • tx_ac (str) – Transcript accession

  • tx_exon_start (int) – Transcript’s exon start coordinate

  • tx_exon_end (int) – Transcript’s exon end coordinate

  • gene (Optional[str]) – HGNC gene symbol

Return type:

tuple[Optional[GenomicAlnData], Optional[str]]

Returns:

Genomic alignment data and warnings if found

async get_cds_start_end(tx_ac)[source]#

Get coding start and end site

Parameters:

tx_ac (str) – Transcript accession

Return type:

Optional[tuple[int, int]]

Returns:

(Coding start site, Coding end site)

async get_chr_assembly(ac)[source]#

Get chromosome and assembly for NC accession if not in GRCh38.

>>> import asyncio
>>> from cool_seq_tool.sources.uta_database import UtaDatabase
>>> uta_db = asyncio.run(UtaDatabase.create())
>>> result = asyncio.run(uta_db.get_chr_assembly("NC_000007.13"))
>>> result
('chr7', <Assembly.GRCH37: 'GRCh37'>)

Returns None if unable to find (either unrecognized/invalid, or a GRCh38 accession).

Parameters:

ac (str) – RefSeq NC accession, e.g. "NC_000007.13"

Return type:

Optional[tuple[str, Assembly]]

Returns:

Chromosome and assembly that accession is on, if available.

async get_gene_from_ac(ac, start_pos, end_pos)[source]#

Get gene(s) within the provided coordinate range

>>> import asyncio
>>> from cool_seq_tool.sources import UtaDatabase
>>> async def get_gene():
...     uta_db = await UtaDatabase.create()
...     result = await uta_db.get_gene_from_ac(
...         "NC_000017.11", 43044296, 43045802
...     )
...     return result
>>> asyncio.run(get_gene())
['BRCA1']
Parameters:
  • ac (str) – NC accession, e.g. "NC_000001.11"

  • start_pos (int) – Start position change

  • end_pos (int) – End position change

Return type:

Optional[list[str]]

Returns:

List of HGNC gene symbols

async get_genomic_tx_data(tx_ac, pos, annotation_layer=AnnotationLayer.CDNA, alt_ac=None, target_genome_assembly=Assembly.GRCH38)[source]#

Get transcript mapping to genomic data.

Parameters:
  • tx_ac (str) – Accession on c. coordinate

  • pos (tuple[int, int]) – (start pos, end pos). These must describe the inter-residue coordinates that are being examined.

  • annotation_layer (Union[Literal[<AnnotationLayer.CDNA: 'c'>], Literal[<AnnotationLayer.GENOMIC: 'g'>]]) – Annotation layer for ac and pos

  • alt_ac (Optional[str]) – Accession on g. coordinate

  • target_genome_assembly (Assembly) – Genome assembly to get genomic data for. If alt_ac is provided, it will return the associated assembly.

Return type:

Optional[GenomicTxMetadata]

Returns:

Metadata for genomic and transcript accessions

async get_mane_c_genomic_data(ac, alt_ac, start_pos, end_pos)[source]#

Get MANE transcript and genomic data. Used when going from g. to MANE c. representation.

>>> import asyncio
>>> from cool_seq_tool.sources import UtaDatabase
>>> async def get_braf_mane():
...     uta_db = await UtaDatabase.create()
...     result = await uta_db.get_mane_c_genomic_data(
...         "NM_004333.6",
...         None,
...         140753335,
...         140753335,
...     )
...     return result
>>> braf = asyncio.run(get_braf_mane())
>>> braf["alt_ac"]
'NC_000007.14'
Parameters:
  • ac (str) – MANE transcript accession

  • alt_ac (Optional[str]) – NC accession. Used to triangulate on correct genomic data. Can be set to None if unavailable.

  • start_pos (int) – Genomic start position

  • end_pos (int) – Genomic end position change

Return type:

Optional[GenomicTxMetadata]

Returns:

Metadata for MANE genomic and transcript accessions results if successful

async get_newest_assembly_ac(ac)[source]#

Find accession associated to latest genomic assembly

Parameters:

ac (str) – Accession

Return type:

list[str]

Returns:

List of accessions associated with the latest genomic assembly, in descending order

static get_secret()[source]#

Get secrets for UTA DB instances. Used for deployment on AWS.

Raises:

ClientError – If unable to retrieve secret value due to decryption failure, internal service error, invalid parameter, invalid request, or resource not found.

Return type:

str

async get_transcripts(start_pos=None, end_pos=None, gene=None, use_tx_pos=True, alt_ac=None)[source]#

Get transcripts for a given gene or alt_ac related to optional positions.

Parameters:
  • start_pos (Optional[int]) – Start position change. If not provided and end_pos not provided, all transcripts associated with the gene and/or accession will be returned

  • end_pos (Optional[int]) – End position change. If not provided and start_pos not provided, all transcripts associated with the gene and/or accession will be returned

  • gene (Optional[str]) – HGNC gene symbol

  • use_tx_pos (bool) – True if querying on transcript position. This means start_pos and end_pos are c. coordinate positions. False if querying on genomic position. This means start_pos and end_pos are g. coordinate positions

  • alt_ac (Optional[str]) – Genomic accession. If not provided, must provide gene

Return type:

DataFrame

Returns:

Data Frame containing transcripts associated with a gene. Transcripts are ordered by most recent NC accession, then by descending transcript length

async get_transcripts_from_genomic_pos(alt_ac, g_pos)[source]#

Get transcripts associated to a genomic ac and position.

Parameters:
  • alt_ac (str) – Genomic accession

  • g_pos (int) – Genomic position

Return type:

list[str]

Returns:

RefSeq transcripts on c. coordinate

async get_tx_exon_aln_v_data(tx_ac, start_pos, end_pos, alt_ac=None, use_tx_pos=True, like_tx_ac=False)[source]#

Return queried data from tx_exon_aln_v table.

Parameters:
  • tx_ac (str) – accession on c. coordinate

  • start_pos (int) – Start position change

  • end_pos (int) – End position change

  • alt_ac (Optional[str]) – accession on g. coordinate

  • use_tx_pos (bool) – True if querying on transcript position. This means start_pos and end_pos are on the c. coordinate. False if querying on genomic position. This means start_pos and end_pos are on the g. coordinate

  • like_tx_ac (bool) – True if tx_ac condition should be a like statement. This is used when you want to query an accession regardless of its version. False if tx_ac condition will be an exact match

Return type:

list[TxExonAlnData]

Returns:

List of transcript exon alignment data

async p_to_c_ac(p_ac)[source]#

Return cDNA reference sequence accession from protein reference sequence accession (i.e. p. to c. in HGVS syntax)

Parameters:

p_ac (str) – Protein accession

Return type:

list[str]

Returns:

List of rows containing c. accessions that are associated with the given p. accession, in ascending order.

async transcript_exists(transcript)[source]#

Return whether or not a transcript exists in the UTA tx_exon_aln_v table.

Parameters:

transcript (str) – A transcript accession

Return type:

bool

Returns:

True if transcript exists in UTA, False if not

async validate_genomic_ac(ac)[source]#

Return whether or not genomic accession exists.

Parameters:

ac (str) – Genomic accession

Return type:

bool

Returns:

True if genomic accession exists. False otherwise.