Source code for BRAD.gene_ontology

"""

Gene Ontology (GO)
------------------

This module provides functions to perform `Gene Ontology (GO) <https://geneontology.org/>`_ searches, download charts, and retrieve associated publications and annotations based on gene terms. The module interacts with external APIs, such as QuickGO and PubMed, to gather the relevant information.

Available Methods
~~~~~~~~~~~~~~~~~

This module has the following methods:

"""

import requests, sys
import pandas as pd
import json
import csv
from requests_html import HTMLSession
import requests
from requests.exceptions import ConnectionError
import os
import copy
from io import StringIO

from BRAD import log

[docs] def geneOntology(state, goQuery): """ Performs Gene Ontology (GO) search for specified genes and updates the chat status with the results. :param goQuery: The query string containing gene names or terms for GO search. :type goQuery: str :param state: The current status of the chat, including the prompt, configuration, and process details. :type state: dict :raises FileNotFoundError: If the gene list file is not found. :return: The updated chat status dictionary containing the GO search results and process details. :rtype: dict """ # Auth: Marc Choi # machoi@umich.edu current_script_path = os.path.abspath(__file__) current_script_dir = os.path.dirname(current_script_path) file_path = os.path.join(current_script_dir, 'helperData', 'gene_list.txt') with open(file_path, 'r') as file: contents = file.read() gene_list = contents.split('\n') if len(goQuery) > 0: go_process = goSearch(goQuery, state) state['process']['GO'] = go_process return state
[docs] def goSearch(query, state): """ Performs a search on Gene Ontology (GO) based on the provided query and allows downloading associated charts and papers. :param query: The query list containing gene names or terms for GO search. :type query: list :return: A dictionary containing the GO search process details. :rtype: dict """ # Auth: Marc Choi # machoi@umich.edu process = {} output = {} for terms in query: output, geneStatus = textGO(terms, state) process['output'] = output if geneStatus == True: state = log.userOutput('\n would you like to download charts associated with these genes [Y/N]?', state=state) for term in output: go_id = str(term[0]) chartGO(go_id, state) state = log.userOutput('\n would you like to download the paper associated with these genes [Y/N]?', state=state) # download2 = input().strip().upper() # process['paper_download'] = (download2 == 'Y') # if download2 == 'Y': pubmedPaper(go_id, state) else: state = log.userOutput('\n would you like to download the gene product annotation [Y/N]?', state=state) for term in query: state = log.userOutput(term, state=state) annotations(term, state) return process
[docs] def textGO(query, state): """ Performs a text-based Gene Ontology (GO) search for a specified query and returns the extracted data and gene status. :param query: The query string containing gene names or terms for GO search. :type query: str :raises requests.HTTPError: If the HTTP request to the GO API fails. :return: A tuple containing the extracted data and a boolean indicating whether the query is a gene. :rtype: tuple """ # Auth: Marc Choi # machoi@umich.edu gene = True requestURL = "https://www.ebi.ac.uk/QuickGO/services/ontology/go/search?query="+query+"&limit=25&page=1" r = requests.get(requestURL, headers={ "Accept" : "application/json"}) if not r.ok: r.raise_for_status() sys.exit() extracted_data = [] responseBody = r.text data = json.loads(responseBody) if data['numberOfHits'] == 0: gene=False state = log.userOutput("No Gene Ontology Available - Searching Gene Products", state=state) requestURL = requestURL = "https://www.ebi.ac.uk/QuickGO/services/geneproduct/search?query="+query r = requests.get(requestURL, headers={ "Accept" : "application/json"}) if not r.ok: r.raise_for_status() sys.exit() responseBody = r.text data = json.loads(responseBody) for result in data['results']: id = result['id'] extracted_data.append(id) for id in extracted_data: state = log.userOutput(f"ID: {id}", state=state) else: for result in data['results']: id = result['id'] text = result['definition']['text'] extracted_data.append((id, text)) # Print the extracted data for id, text in extracted_data: state = log.userOutput(f"ID: {id}", state=state) state = log.userOutput(f"Text: {text}", state=state) return extracted_data, gene
#Input is a GO:----- identification for a gene
[docs] def chartGO(identifier, state): """ Downloads a chart for a specified Gene Ontology (GO) identifier. :param identifier: The GO identifier for which the chart is to be downloaded. :type identifier: str :raises requests.HTTPError: If the HTTP request to download the chart fails. """ # Auth: Marc Choi # machoi@umich.edu try: path = os.path.abspath(os.getcwd()) + '/go_charts' os.makedirs(path, exist_ok = True) state = log.userOutput("Directory '%s' created successfully" % path, state=state) except OSError as error: state = log.userOutput("Directory '%s' can not be created" % path, state=state) requestURL = "https://www.ebi.ac.uk/QuickGO/services/ontology/go/terms/{ids}/chart?ids=GO%3A"+identifier[3:] img_data = requests.get(requestURL).content # save chart with open(os.path.join(path, identifier[3:] + '.jpg'), 'wb') as handler: handler.write(img_data)
#Input is a GO:----- identification for a gene
[docs] def pubmedPaper(identifier, state): """ Downloads PubMed papers associated with a specified Gene Ontology (GO) identifier. :param identifier: The GO identifier for which the associated PubMed papers are to be downloaded. :type identifier: str :raises requests.HTTPError: If the HTTP request to the PubMed API fails. """ # Auth: Marc Choi # machoi@umich.edu requestURL2 = "https://www.ebi.ac.uk/QuickGO/services/ontology/go/terms/GO%3A"+identifier[3:] r = requests.get(requestURL2, headers={ "Accept" : "application/json"}) if not r.ok: r.raise_for_status() sys.exit() responseBody = r.text data = json.loads(responseBody) # Extract the dbId from the first result dbId = [] #state = log.userOutput(data, state=state) if data['numberOfHits'] > 0: xrefs = data['results'][0].get('definition', {}).get('xrefs', []) if xrefs: for db in xrefs: dbId.append(db['dbId']) s = HTMLSession() headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'} if len(dbId) > 0: try: path = os.path.abspath(os.getcwd()) + '/specialized_docs' os.makedirs(path, exist_ok = True) state = log.userOutput("Directory '%s' created successfully" % path, state=state) except OSError as error: state = log.userOutput("Directory '%s' can not be created" % path, state=state) for idname in dbId: state = log.userOutput(idname, state=state) try: base_url = 'https://pubmed.ncbi.nlm.nih.gov/' r = s.get(base_url + idname + '/', headers = headers, timeout = 5) try: pdf_url = r.html.find('a.id-link', first=True).attrs['href'] state = log.userOutput(pdf_url, state=state) if "doi" in pdf_url: state = log.userOutput("Not public", state=state) continue r = s.get(pdf_url, headers = headers, timeout = 5) pdf_real = 'https://ncbi.nlm.nih.gov'+r.html.find('a.int-view', first=True).attrs['href'] state = log.userOutput(pdf_real, state=state) r = s.get(pdf_real, stream=True) with open(os.path.join(path, idname + '.pdf'), 'wb') as f: for chunk in r.iter_content(chunk_size = 1024): if chunk: f.write(chunk) except AttributeError: pass state = log.userOutput(f"{idname} could not be gathered.", state=state) except ConnectionError as e: pass state = log.userOutput(f"{idname} could not be gathered.", state=state) else: state = log.userOutput(f"No paper associated with {identifier} found on PubMed", state=state)
#THIS ONE WORKS BETTER
[docs] def annotations(ids, state): """ Downloads annotations for a specified gene product. :param ids: The gene product identifier for which the annotations are to be downloaded. :type ids: str :raises requests.HTTPError: If the HTTP request to download the annotations fails. """ # Auth: Marc Choi # machoi@umich.edu try: path = os.path.abspath(os.getcwd()) + '/go_annotations' os.makedirs(path, exist_ok = True) state = log.userOutput("Directory '%s' created successfully" % path, state=state) except OSError as error: state = log.userOutput("Directory '%s' can not be created" % path, state=state) requestURL = "https://amigo.geneontology.org/amigo/search/annotation?q="+ids real_id = ids[3:] fetch_annotation(real_id, path)
[docs] def fetch_annotation(id, path): # URL to fetch data from url = 'https://golr-aux.geneontology.io/solr/select?defType=edismax&qt=standard&indent=on&wt=csv&rows=100000&start=0&fl=source%2Cbioentity_internal_id%2Cbioentity_label%2Cqualifier%2Cannotation_class%2Creference%2Cevidence_type%2Cevidence_with%2Caspect%2Cbioentity_name%2Csynonym%2Ctype%2Ctaxon%2Cdate%2Cassigned_by%2Cannotation_extension_class%2Cbioentity_isoform&facet=true&facet.mincount=1&facet.sort=count&json.nl=arrarr&facet.limit=25&hl=true&hl.simple.pre=%3Cem%20class%3D%22hilite%22%3E&hl.snippets=1000&csv.encapsulator=&csv.separator=%09&csv.header=false&csv.mv.separator=%7C&fq=document_category:%22annotation%22&facet.field=aspect&facet.field=taxon_subset_closure_label&facet.field=type&facet.field=evidence_subset_closure_label&facet.field=regulates_closure_label&facet.field=isa_partof_closure_label&facet.field=annotation_class_label&facet.field=qualifier&facet.field=annotation_extension_class_closure_label&facet.field=assigned_by&facet.field=panther_family_label&q=GO%3A'+id+'&qf=annotation_class%5E2&qf=annotation_class_label_searchable%5E1&qf=bioentity%5E2&qf=bioentity_label_searchable%5E1&qf=bioentity_name_searchable%5E1&qf=annotation_extension_class%5E2&qf=annotation_extension_class_label_searchable%5E1&qf=reference_searchable%5E1&qf=panther_family_searchable%5E1&qf=panther_family_label_searchable%5E1&qf=bioentity_isoform%5E1&qf=isa_partof_closure%5E1&qf=isa_partof_closure_label_searchable%5E1' print(url) # Fetch data from the URL response = requests.get(url) # Check if the request was successful if response.status_code == 200: # Read the data into a pandas DataFrame data = StringIO(response.text) df = pd.read_csv(data, sep='\t') # '\t' is the tab separator as specified in the URL # Define the new column names new_column_names = [ 'Source', 'Bioentity Internal ID', 'Bioentity Label', 'Qualifier', 'Annotation Class', 'Reference', 'Evidence Type', 'Evidence With', 'Aspect', 'Bioentity Name', 'Synonym', 'Type', 'Taxon', 'Date', 'Assigned By', 'Annotation Extension Class', 'Bioentity Isoform' ] # Rename the columns directly df.columns = new_column_names # Save the updated DataFrame to a CSV file df.to_csv(path+'/annotations_go_'+id+'.csv', index=False) print("Data with updated column names has been saved to "+path+"/annotations_go_"+id+".csv") else: print(f"Failed to retrieve data. Status code: {response.status_code}")