JacobLasher committed on
Commit
238ea0b
·
verified ·
1 Parent(s): ad57584

Create README.md

Browse files

import requests
import pandas as pd
from transformers import BioGptTokenizer, BioGptForCausalLM

def get_uniprot_data(protein_list):
    """Fetch the first UniProtKB search hit for each protein name.

    Args:
        protein_list: Iterable of protein names (or any UniProt query
            strings). Blank entries are skipped without a request.

    Returns:
        A list with one entry per protein that produced a hit. Each entry
        is the list of tab-separated cell values of the first result row,
        in the order of the requested fields: accession, entry id,
        protein name, gene names, organism and function comment.
        Proteins with no hit or a failed request are omitted.
    """
    search_url = "https://rest.uniprot.org/uniprotkb/search"
    # Return-field names of the current REST API. The legacy names
    # "organism" and "comment(FUNCTION)" are not valid on rest.uniprot.org.
    fields = "accession,id,protein_name,gene_names,organism_name,cc_function"
    results = []

    for protein in protein_list:
        query = str(protein).strip()
        if not query:
            # An empty query can never match anything; do not hit the API.
            continue

        # Let requests URL-encode the query (names may contain spaces, '&', ...).
        # Only the first row is used, so ask the server for a single result.
        params = {"query": query, "fields": fields, "format": "tsv", "size": 1}
        try:
            response = requests.get(search_url, params=params, timeout=30)
        except requests.RequestException as exc:
            # Keep the best-effort behaviour: one bad request must not
            # abort the whole batch.
            print(f"Failed to retrieve {protein}: {exc}")
            continue

        if response.status_code != 200:
            print(f"Failed to retrieve {protein}: {response.status_code}")
            continue

        # Line 0 is the TSV header; line 1 (if any) is the first hit.
        lines = response.text.splitlines()
        if len(lines) < 2 or not lines[1].strip():
            continue  # No hit for this protein.
        results.append(lines[1].split("\t"))

    return results

def analyze_protein_candidates(dataset_path, sheet_name, output_path="uniprot_results.csv"):
    """Look up every protein of an Excel sheet in UniProt and save the result.

    Args:
        dataset_path: Path of the Excel workbook to read.
        sheet_name: Name of the sheet that holds a "Protein Name" column.
        output_path: CSV file the UniProt table is written to.

    Returns:
        A DataFrame with one row per protein for which get_uniprot_data
        returned a hit.
    """
    df = pd.read_excel(dataset_path, sheet_name=sheet_name)
    # Query each distinct, non-missing protein name only once.
    protein_list = df["Protein Name"].dropna().unique().tolist()

    # Fetch UniProt data
    uniprot_results = get_uniprot_data(protein_list)

    # Convert to DataFrame. The labels follow the order in which
    # get_uniprot_data requests its fields: the accession comes first,
    # the entry id ("UniProt ID") second.
    columns = ["Accession", "UniProt ID", "Protein Name", "Gene Names", "Organism", "Function"]
    df_uniprot = pd.DataFrame(uniprot_results, columns=columns)

    # Save to file
    df_uniprot.to_csv(output_path, index=False)
    print(f"UniProt data saved to {output_path}")

    return df_uniprot

def generate_hypotheses(df_uniprot, output_path="protein_hypotheses.csv", max_new_tokens=200):
    """Generate a BioGPT hypothesis per protein on insulin-stimulated glucose uptake.

    Args:
        df_uniprot: DataFrame with "Protein Name", "UniProt ID" and
            "Function" columns, one row per protein.
        output_path: CSV file the hypotheses table is written to.
        max_new_tokens: Number of tokens BioGPT may generate on top of
            the prompt for each protein.

    Returns:
        A DataFrame with the columns "Protein Name", "UniProt ID",
        "Function" and "Hypothesis". The decoded hypothesis text starts
        with the prompt that was fed to the model.
    """
    columns = ["Protein Name", "UniProt ID", "Function", "Hypothesis"]
    hypotheses = []

    # Loading BioGPT is expensive; skip it when there is nothing to process.
    if not df_uniprot.empty:
        tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
        model = BioGptForCausalLM.from_pretrained("microsoft/biogpt")
        # Reserve room for the generated tokens inside the context window so
        # that a long UniProt function text cannot crowd out the answer.
        prompt_budget = max(1, model.config.max_position_embeddings - max_new_tokens)

        for _, row in df_uniprot.iterrows():
            text = f"How might {row['Protein Name']} contribute to insulin-stimulated glucose uptake? {row['Function']}"
            inputs = tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                max_length=prompt_budget,
            )
            # max_new_tokens bounds only the generated part; max_length would
            # also count the prompt tokens.
            outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
            hypothesis = tokenizer.decode(outputs[0], skip_special_tokens=True)
            hypotheses.append([row["Protein Name"], row["UniProt ID"], row["Function"], hypothesis])

    df_hypotheses = pd.DataFrame(hypotheses, columns=columns)
    df_hypotheses.to_csv(output_path, index=False)
    print(f"Generated hypotheses saved to {output_path}")

    return df_hypotheses

# Usage example: read the candidate proteins from one sheet of the workbook,
# look them up in UniProt, then generate one hypothesis per protein.
dataset_path, sheet_name = "your_dataset.xlsx", "4925_2_Uniq_Sig_Incr_Enzymes"
df_uniprot = analyze_protein_candidates(
    dataset_path=dataset_path,
    sheet_name=sheet_name,
)
df_hypotheses = generate_hypotheses(df_uniprot=df_uniprot)

Files changed (1) hide show
  1. README.md +15 -0
README.md ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ datasets:
4
+ - fka/awesome-chatgpt-prompts
5
+ language:
6
+ - en
7
+ metrics:
8
+ - accuracy
9
+ base_model:
10
+ - deepseek-ai/Janus-Pro-7B
11
+ new_version: deepseek-ai/DeepSeek-R1
12
+ library_name: adapter-transformers
13
+ tags:
14
+ - biology
15
+ ---