cool_seq_tool.sources.uta_database#

Provide transcript lookup and metadata tools via the UTA database.

In an asyncio runtime:

>>> from cool_seq_tool.sources.uta_database import (
...     create_uta_connection_pool,
...     UtaDatabase,
... )
>>> pool = await create_uta_connection_pool()
>>> uta_db = UtaDatabase(pool)
>>> async with uta_db.repository() as uta:
...     braf_exists = await uta.gene_exists("BRAF")
>>> braf_exists
True

The class breakdown in this module is intended to reflect a repository pattern: query and data transformation logic is defined in UtaRepository, while connection management and state are handled by a connection pool constructed with create_uta_connection_pool().

The following is an example of how you might employ this in a FastAPI app:

from collections.abc import AsyncGenerator, Generator
from contextlib import asynccontextmanager
from typing import Annotated

from fastapi import FastAPI, Request, Depends
from cool_seq_tool.sources.uta_database import (
    UtaRepository,
    create_uta_connection_pool,
)

@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncGenerator:
    uta_pool = await create_uta_connection_pool()
    app.state.uta_pool = uta_pool
    yield
    await uta_pool.close()


app = FastAPI(lifespan=lifespan)


# dependency function
async def get_uta(request: Request) -> AsyncGenerator[UtaRepository, None]:
    async with request.app.state.uta_pool.connection() as conn:
        yield UtaRepository(conn)


@app.get("/check_gene_exists")
async def check_gene_exists(
    gene: str, uta: Annotated[UtaRepository, Depends(get_uta)]
):
    return await uta.gene_exists(gene)

To reflect the old pattern of constructing/managing connection state, the UtaDatabase class is provided. Its construction will look more like our previous UTA access patterns did, but individual queries should still be issued within a context manager. The repository() method is provided as a convenient means of doing so.

import asyncio

from cool_seq_tool.sources.uta_database import (
    create_uta_connection_pool,
    UtaDatabase,
)


async def example(gene: str):
    pool = await create_uta_connection_pool()
    uta_db = UtaDatabase(pool)
    async with uta_db.repository() as uta:
        print(await uta.gene_exists(gene))
    await uta_db.close()


asyncio.run(example("BRAF"))
exception cool_seq_tool.sources.uta_database.ClosedUtaConnectionError[source]#

Raise for attempts to access a UTA connection when it’s been closed/deleted

class cool_seq_tool.sources.uta_database.ExonCoord(**data)[source]#

Model for representing exon coordinate data

alt_end_i: Annotated[int][source]#
alt_start_i: Annotated[int][source]#
alt_strand: Strand[source]#
ord: Annotated[int][source]#
tx_end_i: Annotated[int][source]#
tx_start_i: Annotated[int][source]#
class cool_seq_tool.sources.uta_database.GenomicAlnData(**data)[source]#

Represent genomic alignment data from UTA tx_exon_aln_mv view

alt_ac: Annotated[str][source]#
alt_end_i: Annotated[int][source]#
alt_start_i: Annotated[int][source]#
alt_strand: Strand[source]#
hgnc: Annotated[str][source]#
ord: Annotated[int][source]#
class cool_seq_tool.sources.uta_database.LazyUtaDatabase(pool=None)[source]#

UTA access wrapper with lazy connection pool initialization.

This variant defers creation of the underlying connection pool until first use. It exists primarily for backward compatibility with earlier APIs that did not require explicit pool construction.

Because configuration is resolved at runtime (via environment variables or defaults), this class can introduce implicit behavior and is not recommended for applications that require explicit control over database connections.

__init__(pool=None)[source]#

Initialize the lazy access wrapper.

Parameters:

pool (Optional[AsyncConnectionPool]) – Optional existing async connection pool. If not provided, a pool will be created on first use using environment variables or default configuration.

async open()[source]#

Ensure that a connection pool has been initialized.

If no pool is currently set, one is created using default configuration.

Return type:

None

repository()[source]#

Yield a repository backed by a pooled UTA connection.

This method ensures that a connection pool exists, creating one if necessary, and then yields a UtaRepository bound to a checked-out connection.

Yield:

Repository bound to an active pooled connection

Return type:

AsyncGenerator[UtaRepository]

exception cool_seq_tool.sources.uta_database.NoMatchingAlignmentError[source]#

Raise for failure to find alignment matching user parameters

class cool_seq_tool.sources.uta_database.ParseResult(pr)[source]#

Subclass of urllib.parse.ParseResult that adds database and schema properties, and provides stringification.

Inspired by: https://github.com/biocommons/hgvs

property database: str | None[source]#

Create database property.

property sanitized_url: str[source]#

Sanitized DB URL with the password masked

property schema: str | None[source]#

Create schema property.

class cool_seq_tool.sources.uta_database.TxExonAlnData(**data)[source]#

Represent data from UTA tx_exon_aln_mv view

alt_aln_method: Annotated[str][source]#
alt_exon_id: Annotated[int][source]#
tx_ac: Annotated[str][source]#
tx_end_i: Annotated[int][source]#
tx_exon_id: Annotated[int][source]#
tx_start_i: Annotated[int][source]#
class cool_seq_tool.sources.uta_database.UtaDatabase(pool)[source]#

Provide pooled access to connection-scoped UTA repositories.

This class owns or borrows an async psycopg connection pool and yields UtaRepository instances bound to checked-out connections.

__init__(pool)[source]#

Initialize access wrapper.

Parameters:

pool (AsyncConnectionPool) – Existing async connection pool to use, e.g. one created with create_uta_connection_pool(). This argument is required; use LazyUtaDatabase if the pool should be created lazily on first use.

async close()[source]#

Close the owned connection pool, if present.

Return type:

None

repository()[source]#

Yield a UtaRepository backed by a pooled connection.

The connection is checked out from the pool supplied at initialization; unlike LazyUtaDatabase, this class does not create a default pool on first use.

Yield:

Repository bound to an active pooled connection

Raises:

ClosedUtaConnectionError – if the connection associated with this instance is closed or nullified

Return type:

AsyncGenerator[UtaRepository]

class cool_seq_tool.sources.uta_database.UtaRepository(conn)[source]#

Connection-scoped repository for issuing queries against UTA.

This class encapsulates predefined UTA queries and related result parsing. It operates on an active psycopg async connection provided at initialization time and does not manage connection lifecycle or pooling.

Instances are intended to be short-lived and used within the scope of a checked-out connection (e.g., from a connection pool).

__init__(conn)[source]#

Initialize the repository with an active database connection.

Parameters:

conn (AsyncConnection) – Active psycopg async connection to a UTA database. The caller is responsible for connection lifecycle management.

async execute_query(q, params=None)[source]#

Execute an arbitrary query against the UTA DB

This method is marked as public so that downstream applications can run custom queries using the same DB connection. However, that means they are responsible for managing the cursor themselves.

For the sake of compactness and separation of concerns, other modules in CoolSeqTool should avoid use of this method, and should instead add new methods to the repository class itself.

Parameters:
  • q (str) – raw query. May need to specify schema depending on connection context.

  • params (UnionType[Sequence, Mapping, None]) – query variables, if needed. These should not be hard-coded into the query.

Return type:

AsyncCursor

Returns:

query result cursor

Raises:

UndefinedTable – if queried table isn’t in the search_path – this likely indicates a UTA schema/search path config issue

async gene_exists(gene)[source]#

Return whether or not a gene symbol exists in UTA gene table

Parameters:

gene (str) – Gene symbol

Return type:

bool

Returns:

True if gene symbol exists in UTA, False if not

async get_ac_descr(ac)[source]#

Return free-text accession description

This is typically available only for accessions from older (pre-GRCh38) builds.

>>> async with uta_db.repository() as uta:
...     result = await uta.get_ac_descr("NC_000001.10")
>>> result
'Homo sapiens chromosome 1, GRCh37.p13 Primary Assembly'
Parameters:

ac (str) – chromosome accession, e.g. "NC_000001.10"

Return type:

Optional[str]

Returns:

Free-text description provided by source, generally containing assembly and chromosome

async get_ac_from_gene(gene)[source]#

Return genomic accession(s) associated to a gene.

Parameters:

gene (str) – Gene symbol

Return type:

list[str]

Returns:

List of genomic accessions, sorted in descending order

async get_all_exon_coords(tx_ac, genomic_ac=None)[source]#

Get all exon coordinate data for a transcript.

If genomic_ac is NOT provided, this method will use the GRCh38 accession associated to tx_ac.

Parameters:
  • tx_ac (str) – The RefSeq transcript accession to get exon data for.

  • genomic_ac (Optional[str]) – The RefSeq genomic accession to get exon data for.

Return type:

list[ExonCoord]

Returns:

List of all exon coordinate data for tx_ac and genomic_ac. The exon coordinate data will include the exon number, transcript and genomic positions for the start and end of the exon, and strand. The list will be ordered by ascending exon number.

async get_alt_ac_start_or_end(tx_ac, tx_exon_start, tx_exon_end, gene)[source]#

Get genomic data for related transcript exon start or end.

Parameters:
  • tx_ac (str) – Transcript accession

  • tx_exon_start (int) – Transcript’s exon start coordinate

  • tx_exon_end (int) – Transcript’s exon end coordinate

  • gene (Optional[str]) – HGNC gene symbol, if available

Return type:

GenomicAlnData

Returns:

Genomic alignment data if match found

Raises:

NoMatchingAlignmentError – if unable to find alignment matching given params

async get_alt_acs_for_tx(tx_ac)[source]#

Return genomic reference sequences associated with transcript accession

Parameters:

tx_ac (str) – transcript accession

Return type:

list[str]

Returns:

list of genomic accessions for which alignments exist to tx_ac

async get_cds_start_end(tx_ac)[source]#

Return CDS start/end coordinates for a transcript.

Strips version from Ensembl accessions (ENS*) since UTA stores them unversioned.

Parameters:

tx_ac (str) – Transcript accession

Return type:

Optional[tuple[int, int]]

Returns:

(cds_start_i, cds_end_i) if both exist, else None

async get_chr_assembly(ac)[source]#

Get chromosome and assembly for NC accession if not in GRCh38.

>>> async with uta_db.repository() as uta:
...     result = await uta.get_chr_assembly("NC_000007.13")
>>> result
('chr7', <Assembly.GRCH37: 'GRCh37'>)

Returns None if unable to find (either unrecognized/invalid, or a GRCh38 accession).

Parameters:

ac (str) – RefSeq NC accession, eg "NC_000007.13"

Return type:

Optional[tuple[str, Assembly]]

Returns:

Chromosome and assembly that accession is on, if available.

async get_gene_from_ac(ac, start_pos, end_pos)[source]#

Get gene(s) within the provided coordinate range

>>> async with uta_db.repository() as uta:
...     result = await uta.get_gene_from_ac("NC_000017.11", 43044296, 43045802)
>>> result
['BRCA1']

This function performs a relatively expensive condition check and is expected to be relatively slow (~100ms under ideal conditions). If users need a more efficient lookup of this form, they should either create their own materialized views/indices, or lobby for their inclusion in a new UTA release.

Parameters:
  • ac (str) – NC accession, e.g. "NC_000001.11"

  • start_pos (int) – Start position change

  • end_pos (Optional[int]) – End position change

Return type:

Optional[list[str]]

Returns:

List of HGNC gene symbols

async get_gene_from_tx_ac(tx_ac)[source]#

Return HGNC gene name for a transcript accession

Parameters:

tx_ac (str) – transcript accession

Return type:

Optional[str]

Returns:

gene name, if found

async get_genomic_tx_data(tx_ac, pos, annotation_layer=AnnotationLayer.CDNA, alt_ac=None, target_genome_assembly=Assembly.GRCH38)[source]#

Get transcript mapping to genomic data.

Parameters:
  • tx_ac (str) – Accession on c. coordinate

  • pos (tuple[int, int]) – (start pos, end pos). These must describe the inter-residue coordinates that are being examined.

  • annotation_layer (Union[Literal[<AnnotationLayer.CDNA: 'c'>], Literal[<AnnotationLayer.GENOMIC: 'g'>]]) – Annotation layer for ac and pos

  • alt_ac (Optional[str]) – Accession on g. coordinate

  • target_genome_assembly (Assembly) – Genome assembly to get genomic data for. If alt_ac is provided, it will return the associated assembly.

Return type:

Optional[GenomicTxMetadata]

Returns:

Metadata for genomic and transcript accessions

async get_mane_c_genomic_data(ac, alt_ac, start_pos, end_pos)[source]#

Get MANE transcript and genomic data. Used when going from g. to MANE c. representation. This function parses queried data from the tx_exon_aln_mv table, and sorts the queried data by the most recent genomic build

>>> async with uta_db.repository() as uta:
...     result = await uta.get_mane_c_genomic_data(
...         "NM_004333.6",
...         None,
...         140753335,
...         140753335,
...     )
>>> result.alt_ac
'NC_000007.14'
Parameters:
  • ac (str) – MANE transcript accession

  • alt_ac (Optional[str]) – NC accession. Used to triangulate on correct genomic data. Can be set to None if unavailable.

  • start_pos (int) – Genomic start position

  • end_pos (int) – Genomic end position change

Return type:

Optional[GenomicTxMetadata]

Returns:

Metadata for the MANE genomic and transcript accessions, if successful

async get_newest_assembly_ac(ac)[source]#

Return newest accession versions matching the given prefix

If the accession is Ensembl (EN prefix), results are ordered lexicographically. Otherwise, RefSeq-style accessions are ordered by version number in descending order.

Parameters:

ac (str) – Accession (versioned or unversioned)

Return type:

list[str]

Returns:

List of matching accessions, newest version first

async get_transcripts(start_pos=None, end_pos=None, gene=None, use_tx_pos=True, alt_ac=None)[source]#

Get transcripts for a given gene or alt_ac related to optional positions.

Parameters:
  • start_pos (Optional[int]) – Start position change. If neither start_pos nor end_pos is provided, all transcripts associated with the gene and/or accession will be returned.

  • end_pos (Optional[int]) – End position change. If neither start_pos nor end_pos is provided, all transcripts associated with the gene and/or accession will be returned.

  • gene (Optional[str]) – HGNC gene symbol

  • use_tx_pos (bool) – True if querying on transcript position. This means start_pos and end_pos are c. coordinate positions. False if querying on genomic position. This means start_pos and end_pos are g. coordinate positions

  • alt_ac (Optional[str]) – Genomic accession. If not provided, must provide gene

Return type:

DataFrame

Returns:

Data Frame containing transcripts associated with a gene. Transcripts are ordered by most recent NC accession, then by descending transcript length

async get_transcripts_from_genomic_pos(alt_ac, g_pos)[source]#

Get transcripts associated to a genomic ac and position.

Parameters:
  • alt_ac (str) – Genomic accession

  • g_pos (int) – Genomic position

Return type:

list[str]

Returns:

RefSeq transcripts on c. coordinate

async get_tx_exon_aln_data(tx_ac, start_pos, end_pos, alt_ac=None, use_tx_pos=True, like_tx_ac=False)[source]#

Get alignments between exons and reference sequences.

This is a direct query against the UTA tx_exon_aln_mv view.

Parameters:
  • tx_ac (str) – accession on c. coordinate

  • start_pos (int) – Start position change

  • end_pos (int) – End position change

  • alt_ac (Optional[str]) – accession on g. coordinate

  • use_tx_pos (bool) – True if querying on transcript position, meaning start_pos and end_pos are on the c. coordinate. False if querying on genomic position, meaning start_pos and end_pos are on the g. coordinate.

  • like_tx_ac (bool) – True if the tx_ac condition should be a LIKE statement; this is used when you want to query an accession regardless of its version. False if the tx_ac condition should be an exact match.

Return type:

list[TxExonAlnData]

Returns:

List of transcript exon alignment data

async p_to_c_ac(p_ac)[source]#

Return cDNA reference sequence accession from protein reference sequence accession (i.e. p. to c. in HGVS syntax)

Parameters:

p_ac (str) – Protein accession

Return type:

list[str]

Returns:

List of c. accessions associated with the given p. accession, in ascending order.

async transcript_exists(transcript)[source]#

Return whether or not a transcript exists in the UTA tx_exon_aln_mv table

Parameters:

transcript (str) – A transcript accession

Return type:

bool

Returns:

True if transcript exists in UTA, False if not

async validate_genomic_ac(ac)[source]#

Return whether or not genomic accession exists.

Parameters:

ac (str) – Genomic accession

Return type:

bool

Returns:

True if genomic accession exists. False otherwise.

async validate_genomic_breakpoint(pos, genomic_ac, tx_ac)[source]#
Validate that a genomic coordinate falls within the first and last exon for a transcript on a given accession

Parameters:
  • pos (int) – Genomic position on genomic_ac

  • genomic_ac (str) – RefSeq genomic accession, e.g. "NC_000007.14"

  • tx_ac (str) – A transcript accession

Return type:

bool

Returns:

True if the coordinate falls within 150bp of the first and last exon for the transcript, False if not. Breakpoints past this threshold are likely erroneous.

async cool_seq_tool.sources.uta_database.create_uta_connection_pool(db_url=None)[source]#

Create and initialize a UTA connection pool.

Connection parameters are resolved in the following order:

  1. If the UTA_DB_PROD environment variable is set, credentials and schema are retrieved from a secret manager via _get_secret_args().

  2. Otherwise, if the db_url arg is defined, it’s used

  3. If not provided, fall back to environment variable UTA_DB_URL

  4. If not declared, then use default value

Note

Connection strings are expected to look like this:

postgresql://user@host:port/db?options=-csearch_path%3Duta_schema,public

For example, this is the default:

postgresql://anonymous@localhost:5432/uta?options=-csearch_path%3Duta_20241220,public

Biocommons-style connection strings are also currently supported, but they are considered deprecated:

postgresql://anonymous@localhost:5432/uta/uta_20241220

Parameters:

db_url (Optional[str]) – PostgreSQL connection URI. If not provided, it is resolved from the environment or defaults.

Return type:

AsyncConnectionPool

Returns:

An open AsyncConnectionPool configured for the UTA database