Create README.md
Browse files
import requests
import pandas as pd
from transformers import BioGptTokenizer, BioGptForCausalLM
def get_uniprot_data(protein_list):
    """Fetch the top UniProtKB search hit for each protein in ``protein_list``.

    Each entry is sent as a free-text query to the UniProt REST search
    endpoint and the first result row of the TSV response is kept.

    Args:
        protein_list: Iterable of protein names (or any search terms).

    Returns:
        A list with one item per protein that produced a hit. Each item is
        a list of strings in the order: accession, entry name (``id``),
        protein name, gene names, organism, function comment.
        Proteins with no hit or a failed request are reported on stdout
        and skipped, so the result may be shorter than the input.
    """
    search_url = "https://rest.uniprot.org/uniprotkb/search"
    # Return-field names of the current UniProt REST API. The legacy names
    # ``organism`` and ``comment(FUNCTION)`` are not accepted by
    # rest.uniprot.org and make the request fail.
    fields = "accession,id,protein_name,gene_names,organism_name,cc_function"
    results = []
    for protein in protein_list:
        # Passing the query through ``params`` lets requests URL-encode it,
        # so names containing spaces, '&', '+' etc. are sent intact.
        params = {
            "query": protein,
            "fields": fields,
            "format": "tsv",
            "size": 1,  # only the first hit is used
        }
        try:
            response = requests.get(search_url, params=params, timeout=30)
        except requests.RequestException as exc:
            # Keep the best-effort behaviour: one bad lookup must not abort
            # the whole batch.
            print(f"Failed to retrieve {protein}: {exc}")
            continue
        if response.status_code != 200:
            print(f"Failed to retrieve {protein}: {response.status_code}")
            continue
        # Line 0 is the TSV header. A query without hits returns the header
        # followed by an empty line, which must not be recorded as a row.
        hits = [line for line in response.text.splitlines()[1:] if line.strip()]
        if not hits:
            print(f"No UniProt entry found for {protein}")
            continue
        results.append(hits[0].split("\t"))
    return results
def analyze_protein_candidates(dataset_path, sheet_name, output_path="uniprot_results.csv"):
    """Look up every protein named in an Excel sheet on UniProt and save the hits.

    Args:
        dataset_path: Path to the Excel workbook.
        sheet_name: Worksheet to read; it must contain a "Protein Name" column.
        output_path: CSV file the UniProt table is written to. Defaults to
            "uniprot_results.csv".

    Returns:
        A DataFrame with columns "UniProt ID", "Accession", "Protein Name",
        "Gene Names", "Organism" and "Function", one row per protein for
        which ``get_uniprot_data`` returned a hit.

    Raises:
        KeyError: If the sheet has no "Protein Name" column.
    """
    df = pd.read_excel(dataset_path, sheet_name=sheet_name)
    if "Protein Name" not in df.columns:
        raise KeyError(
            f"Sheet {sheet_name!r} has no 'Protein Name' column; "
            f"found: {list(df.columns)}"
        )

    # Normalise to stripped strings so blank cells are not sent as queries
    # and names differing only in surrounding whitespace are deduplicated.
    names = df["Protein Name"].dropna().astype(str).str.strip()
    protein_list = names[names != ""].unique().tolist()

    # Fetch UniProt data
    uniprot_results = get_uniprot_data(protein_list)

    # NOTE(review): get_uniprot_data requests ``accession`` first and ``id``
    # (the entry name) second, so the "Accession" column actually holds the
    # entry name. Labels are kept unchanged for compatibility with existing
    # consumers of the CSV -- confirm before renaming.
    columns = ["UniProt ID", "Accession", "Protein Name", "Gene Names", "Organism", "Function"]
    width = len(columns)
    # Drop blank rows and pad/trim the rest to the expected width; a single
    # ragged row would otherwise make the DataFrame constructor raise.
    rows = [
        (list(row) + [""] * width)[:width]
        for row in uniprot_results
        if any(str(cell).strip() for cell in row)
    ]
    df_uniprot = pd.DataFrame(rows, columns=columns)

    # Save to file
    df_uniprot.to_csv(output_path, index=False)
    print(f"UniProt data saved to {output_path}")
    return df_uniprot
def generate_hypotheses(df_uniprot, output_path="protein_hypotheses.csv", max_new_tokens=100):
    """Generate a BioGPT hypothesis per protein on insulin-stimulated glucose uptake.

    For every row a prompt is built from the protein name and its UniProt
    function text, BioGPT continues the prompt, and the decoded sequence
    (prompt plus continuation) is stored as the hypothesis.

    Args:
        df_uniprot: DataFrame with "Protein Name", "UniProt ID" and
            "Function" columns.
        output_path: CSV file the hypotheses are written to. Defaults to
            "protein_hypotheses.csv".
        max_new_tokens: Number of tokens BioGPT may generate beyond the
            prompt.

    Returns:
        A DataFrame with columns "Protein Name", "UniProt ID", "Function"
        and "Hypothesis", one row per input row.
    """
    tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
    model = BioGptForCausalLM.from_pretrained("microsoft/biogpt")

    # Prompt and continuation must fit in the model's context window together,
    # so reserve room for the generated tokens and truncate the prompt to the rest.
    context_limit = getattr(model.config, "max_position_embeddings", 1024)
    prompt_budget = max(1, context_limit - max_new_tokens)

    hypotheses = []
    for _, row in df_uniprot.iterrows():
        # A missing annotation (None/NaN) would otherwise be rendered as the
        # literal text "None"/"nan" inside the prompt.
        function_text = "" if pd.isna(row["Function"]) else row["Function"]
        text = f"How might {row['Protein Name']} contribute to insulin-stimulated glucose uptake? {function_text}"
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=prompt_budget)
        # ``max_length`` would count the prompt tokens too, leaving little or
        # no room to generate for long function texts; ``max_new_tokens``
        # bounds only the continuation.
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
        hypothesis = tokenizer.decode(outputs[0], skip_special_tokens=True)
        hypotheses.append([row["Protein Name"], row["UniProt ID"], row["Function"], hypothesis])

    df_hypotheses = pd.DataFrame(hypotheses, columns=["Protein Name", "UniProt ID", "Function", "Hypothesis"])
    df_hypotheses.to_csv(output_path, index=False)
    print(f"Generated hypotheses saved to {output_path}")
    return df_hypotheses
# Usage example: runs the whole pipeline at module level. There is no
# ``__main__`` guard here, so these statements also execute on import.
# Presumably a placeholder path -- point it at the real Excel workbook.
dataset_path = "your_dataset.xlsx"
# Worksheet to read; analyze_protein_candidates reads its "Protein Name" column.
sheet_name = "4925_2_Uniq_Sig_Incr_Enzymes"
# Step 1: query UniProt for the proteins in the sheet and save the table as CSV.
df_uniprot = analyze_protein_candidates(dataset_path, sheet_name)
# Step 2: prompt BioGPT once per retrieved protein and save the hypotheses as CSV.
df_hypotheses = generate_hypotheses(df_uniprot)
@@ -0,0 +1,15 @@
---
license: mit
datasets:
- fka/awesome-chatgpt-prompts
language:
- en
metrics:
- accuracy
base_model:
- deepseek-ai/Janus-Pro-7B
new_version: deepseek-ai/DeepSeek-R1
library_name: adapter-transformers
tags:
- biology
---