cool_seq_tool.sources.uta_database#

Provide transcript lookup and metadata tools via the UTA database.

class cool_seq_tool.sources.uta_database.ParseResult(pr)[source]#

Subclass of urllib.parse.ParseResult that adds database and schema properties, and provides stringification. Source: https://github.com/biocommons/hgvs

property database: str | None[source]#

Create database property.

property schema: str | None[source]#

Create schema property.

class cool_seq_tool.sources.uta_database.UtaDatabase(db_url=UTA_DB_URL, chain_file_37_to_38=None, chain_file_38_to_37=None)[source]#

Provide transcript lookup and metadata tools via the Universal Transcript Archive (UTA) database.

Users should use the create() method to construct a new instance. Note that almost all public methods are defined as async – see the Usage section for more information.

>>> import asyncio
>>> from cool_seq_tool.sources.uta_database import UtaDatabase
>>> uta_db = asyncio.run(UtaDatabase.create())
__init__(db_url=UTA_DB_URL, chain_file_37_to_38=None, chain_file_38_to_37=None)[source]#

Initialize DB class. Should only be used by create() method, and not be called directly by a user.

Parameters:
  • db_url (str) – PostgreSQL connection URL. Format: driver://user:password@host/database/schema

  • chain_file_37_to_38 (Optional[str]) – Optional path to chain file for 37 to 38 assembly. This is used for agct. If this is not provided, will check to see if LIFTOVER_CHAIN_37_TO_38 env var is set. If neither is provided, will allow agct to download a chain file from UCSC

  • chain_file_38_to_37 (Optional[str]) – Optional path to chain file for 38 to 37 assembly. This is used for agct. If this is not provided, will check to see if LIFTOVER_CHAIN_38_TO_37 env var is set. If neither is provided, will allow agct to download a chain file from UCSC

async classmethod create(db_url=UTA_DB_URL)[source]#

Manufacture a fully-initialized class instance (a la factory pattern). This method should be used instead of calling the class directly to create a new instance.

>>> import asyncio
>>> from cool_seq_tool.sources.uta_database import UtaDatabase
>>> uta_db = asyncio.run(UtaDatabase.create())
Parameters:
  • cls – supplied implicitly

  • db_url (str) – PostgreSQL connection URL. Format: driver://user:password@host/database/schema

Return type:

TypeVar(UTADatabaseType, bound= UtaDatabase)

Returns:

UTA DB access class instance

async create_pool()[source]#

Create connection pool if not already created.

Return type:

None

static data_from_result(result)[source]#

Return data found from result.

Parameters:

result (List) – Data from tx_exon_aln_v table

Return type:

Optional[Dict]

Returns:

Gene, strand, and position ranges for tx and alt_ac

async execute_query(query)[source]#

Execute a query and return its result.

Parameters:

query (str) – Query to make on database

Return type:

Any

Returns:

Query’s result

async get_ac_descr(ac)[source]#

Return accession description. This is typically available only for accessions from older (pre-GRCh38) builds.

>>> import asyncio
>>> from cool_seq_tool.sources.uta_database import UtaDatabase
>>> async def describe():
...     uta_db = await UtaDatabase.create()
...     result = await uta_db.get_ac_descr("NC_000001.10")
...     return result
>>> asyncio.run(describe())
'Homo sapiens chromosome 1, GRCh37.p13 Primary Assembly'
Parameters:

ac (str) – chromosome accession, e.g. "NC_000001.10"

Return type:

Optional[str]

Returns:

Description containing assembly and chromosome

async get_ac_from_gene(gene)[source]#

Return genomic accession(s) associated with a gene.

Parameters:

gene (str) – Gene symbol

Return type:

List[str]

Returns:

List of genomic accessions, sorted in descending order

async get_alt_ac_start_or_end(tx_ac, tx_exon_start, tx_exon_end, gene)[source]#

Get genomic data for related transcript exon start or end.

Parameters:
  • tx_ac (str) – Transcript accession

  • tx_exon_start (int) – Transcript’s exon start coordinate

  • tx_exon_end (int) – Transcript’s exon end coordinate

  • gene (Optional[str]) – HGNC gene symbol

Return type:

Tuple[Optional[Tuple[str, str, int, int, int]], Optional[str]]

Returns:

[hgnc symbol, genomic accession for chromosome, aligned genomic start coordinate, aligned genomic end coordinate, strand], and warnings if found

async get_cds_start_end(tx_ac)[source]#

Get coding start and end site

Parameters:

tx_ac (str) – Transcript accession

Return type:

Optional[Tuple[int, int]]

Returns:

[Coding start site, Coding end site]

async get_chr_assembly(ac)[source]#

Get chromosome and assembly for NC accession if not in GRCh38.

Parameters:

ac (str) – NC accession

Return type:

Optional[Tuple[str, str]]

Returns:

Chromosome and assembly that the accession is on

async get_gene_from_ac(ac, start_pos, end_pos)[source]#

Get gene(s) within the provided coordinate range

>>> import asyncio
>>> from cool_seq_tool.sources import UtaDatabase
>>> async def get_gene():
...     uta_db = await UtaDatabase.create()
...     result = await uta_db.get_gene_from_ac("NC_000017.11", 43044296, 43045802)
...     return result
>>> asyncio.run(get_gene())
['BRCA1']
Parameters:
  • ac (str) – NC accession, e.g. "NC_000001.11"

  • start_pos (int) – Start position change

  • end_pos (int) – End position change

Return type:

Optional[List[str]]

Returns:

List of HGNC gene symbols

async get_genes_and_alt_acs(pos, strand=None, chromosome=None, alt_ac=None, gene=None)[source]#

Return genes and genomic accessions for a position on a chromosome or alt_ac

Parameters:
  • pos (int) – Genomic position

  • strand (Optional[Strand]) – Strand

  • chromosome (Optional[int]) – Chromosome. Must give chromosome without a prefix (e.g. 1 or X). If not provided, must provide alt_ac. If alt_ac is also provided, alt_ac will be used.

  • alt_ac (Optional[str]) – Genomic accession (e.g. NC_000001.11). If not provided, must provide chromosome. If chromosome is also provided, alt_ac will be used.

  • gene (Optional[str]) – Gene symbol

Return type:

Tuple[Optional[Dict], Optional[str]]

Returns:

Dictionary containing genes and genomic accessions and warnings if found

async get_genomic_tx_data(tx_ac, pos, annotation_layer=AnnotationLayer.CDNA, alt_ac=None, target_genome_assembly=Assembly.GRCH38)[source]#

Get transcript mapping to genomic data.

Parameters:
  • tx_ac (str) – Accession on c. coordinate

  • pos (Tuple[int, int]) – (start pos, end pos)

  • annotation_layer (Union[c, g]) – Annotation layer for ac and pos

  • alt_ac (Optional[str]) – Accession on g. coordinate

  • target_genome_assembly (Assembly) – Genome assembly to get genomic data for. If alt_ac is provided, it will return the associated assembly.

Return type:

Optional[Dict]

Returns:

Gene, Transcript accession and position change, Altered transcript accession and position change, Strand

get_liftover(chromosome, pos, liftover_to_assembly)[source]#

Get new genome assembly data for a position on a chromosome.

Parameters:
  • chromosome (str) – The chromosome number. Must be prefixed with chr

  • pos (int) – Position on the chromosome

  • liftover_to_assembly (Assembly) – Assembly to liftover to

Return type:

Optional[Tuple[str, int]]

Returns:

Target chromosome and target position for assembly

async get_mane_c_genomic_data(ac, alt_ac, start_pos, end_pos)[source]#

Get MANE transcript and genomic data. Used when going from g. to MANE c. representation.

>>> import asyncio
>>> from cool_seq_tool.sources import UtaDatabase
>>> async def get_braf_mane():
...     uta_db = await UtaDatabase.create()
...     result = await uta_db.get_mane_c_genomic_data(
...         "NM_004333.6",
...         None,
...         140753335,
...         140753335,
...     )
...     return result
>>> braf = asyncio.run(get_braf_mane())
>>> braf["alt_ac"]
'NC_000007.14'
Parameters:
  • ac (str) – MANE transcript accession

  • alt_ac (Optional[str]) – NC accession. Used to triangulate on correct genomic data. Can be set to None if unavailable.

  • start_pos (int) – Genomic start position

  • end_pos (int) – Genomic end position

Return type:

Optional[Dict]

Returns:

MANE transcript results if successful

async get_newest_assembly_ac(ac)[source]#

Find accession associated with the latest genomic assembly

Parameters:

ac (str) – Accession

Return type:

List[str]

Returns:

List of accessions associated with the latest genomic assembly, in descending order

static get_secret()[source]#

Get secrets for UTA DB instances. Used for deployment on AWS.

Return type:

str

async get_transcripts(start_pos=None, end_pos=None, gene=None, use_tx_pos=True, alt_ac=None)[source]#

Get transcripts for a given gene or alt_ac related to optional positions.

Parameters:
  • start_pos (Optional[int]) – Start position change. If neither start_pos nor end_pos is provided, all transcripts associated with the gene and/or accession will be returned.

  • end_pos (Optional[int]) – End position change. If neither end_pos nor start_pos is provided, all transcripts associated with the gene and/or accession will be returned.

  • gene (Optional[str]) – HGNC gene symbol

  • use_tx_pos (bool) – True if querying on transcript position. This means start_pos and end_pos are c. coordinate positions. False if querying on genomic position. This means start_pos and end_pos are g. coordinate positions

  • alt_ac (Optional[str]) – Genomic accession. If not provided, must provide gene

Return type:

DataFrame

Returns:

Data Frame containing transcripts associated with a gene. Transcripts are ordered by most recent NC accession, then by descending transcript length

async get_transcripts_from_genomic_pos(alt_ac, g_pos)[source]#

Get transcripts associated with a genomic accession and position.

Parameters:
  • alt_ac (str) – Genomic accession

  • g_pos (int) – Genomic position

Return type:

List[str]

Returns:

RefSeq transcripts on c. coordinate

async get_tx_exon_aln_v_data(tx_ac, start_pos, end_pos, alt_ac=None, use_tx_pos=True, like_tx_ac=False)[source]#

Return queried data from tx_exon_aln_v table.

Parameters:
  • tx_ac (str) – accession on c. coordinate

  • start_pos (int) – Start position change

  • end_pos (int) – End position change

  • alt_ac (Optional[str]) – accession on g. coordinate

  • use_tx_pos (bool) – True if querying on transcript position. This means start_pos and end_pos are on the c. coordinate. False if querying on genomic position. This means start_pos and end_pos are on the g. coordinate.

  • like_tx_ac (bool) – True if the tx_ac condition should be a LIKE statement. This is used when you want to query an accession regardless of its version. False if the tx_ac condition should be an exact match.

Return type:

List

Returns:

List of tx_exon_aln_v data

async get_tx_exons(tx_ac, alt_ac=None)[source]#

Get list of a transcript’s exon start/end coordinates.

Parameters:
  • tx_ac (str) – Transcript accession

  • alt_ac (Optional[str]) – Genomic accession

Return type:

Tuple[Optional[List[Tuple[int, int]]], Optional[str]]

Returns:

List of a transcript’s exon start/end coordinates, and warnings if found

async get_tx_exons_genomic_coords(tx_ac, alt_ac)[source]#

Get exon number, transcript coordinates, and genomic coordinates

Parameters:
  • tx_ac (str) – Transcript accession

  • alt_ac (str) – RefSeq genomic accession

Return type:

Tuple[Optional[Tuple[int, int, int, int, int]], Optional[str]]

Returns:

Tuple of exon numbers, transcript and genomic coordinates, and warnings if found

async liftover_to_38(genomic_tx_data)[source]#

Liftover genomic_tx_data to hg38 assembly.

Parameters:

genomic_tx_data (Dict) – Dictionary containing gene, nc_accession, alt_pos, and strand

Return type:

None

async p_to_c_ac(p_ac)[source]#

Return cDNA reference sequence accession from protein reference sequence accession (i.e. p. to c. in HGVS syntax)

Parameters:

p_ac (str) – Protein accession

Return type:

List[str]

Returns:

List of c. accessions that are associated with the given p. accession, in ascending order.

async validate_genomic_ac(ac)[source]#

Return whether or not genomic accession exists.

Parameters:

ac (str) – Genomic accession

Return type:

bool

Returns:

True if genomic accession exists. False otherwise.