Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sequence data feature #5

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
8 changes: 7 additions & 1 deletion strAPI/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,12 @@ def show_allele_freqs(repeat_id: int, db: Session = Depends(get_db)):
# return []
return allfreqs

@app.get("/allseq/", response_model=List[schemas.AlleleSequence], tags=["Repeats"])
def show_allele_seq(repeat_id: int, db: Session = Depends(get_db)):
allseq = db.query(models.AlleleSequence).filter(models.AlleleSequence.repeat_id == repeat_id).all()
return allseq


"""
Retrieve repeat info given a repeat id

Expand Down Expand Up @@ -457,4 +463,4 @@ def get_crc_expr_repeatlen_corr(db: Session = Depends(get_db), limit = 7000):
joinedload(models.CRCExprRepeatLenCorr.gene), joinedload(models.CRCExprRepeatLenCorr.repeat)
).all()

return [{**c.gene.__dict__, **c.repeat.__dict__, **c.__dict__} for c in correlations]
return [{**c.gene.__dict__, **c.repeat.__dict__, **c.__dict__} for c in correlations]
27 changes: 24 additions & 3 deletions strAPI/repeats/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ class Transcript(SQLModel, table=True):
end: int = Field(nullable=False)

# one to many Gene -> Transcripts
gene_id = Field(Integer, foreign_key ="genes.id")
gene_id: int = Field(Integer, foreign_key ="genes.id")

gene: "Gene" = Relationship(back_populates="transcripts")

Expand Down Expand Up @@ -183,7 +183,23 @@ class AlleleFrequency(SQLModel, table=True):
# One to many, Repeat -> AlleleFrequency
repeat_id: int = Field(foreign_key = "repeats.id")
repeat: "Repeat" = Relationship(back_populates="allfreqs")



class AlleleSequence(SQLModel, table=True):
__tablename__ = "allele_sequences"

id: int = Field(primary_key=True)

population: str = Field(nullable=False)
n_effective: int = Field(nullable=False)
frequency: float = Field(nullable=False)
num_called: Optional[int] = Field(nullable=True)
sequence: str = Field(nullable=False)

# One to many, Repeat -> AlleleSequence
repeat_id: int = Field(foreign_key = "repeats.id")
repeat: "Repeat" = Relationship(back_populates="allseq")


class Repeat(SQLModel, table=True):
__tablename__ = "repeats"
Expand Down Expand Up @@ -221,7 +237,12 @@ class Repeat(SQLModel, table=True):
allfreqs: Optional[List["AlleleFrequency"]] = Relationship(
back_populates="repeat"
)


# Add relationship directive to Repeat class for one to many Repeat - AlleleSequence
allseq: Optional[List["AlleleSequence"]] = Relationship(
back_populates="repeat"
)

# many to many Repeats <-> Transcripts
transcripts: List["Transcript"] = Relationship(back_populates="repeats", link_model = RepeatTranscriptsLink)

Expand Down
13 changes: 12 additions & 1 deletion strAPI/repeats/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,17 @@ class AlleleFrequency(BaseModel):
class Config:
orm_mode = True

class AlleleSequence(BaseModel):
population: str
n_effective: int
frequency: float
num_called: int
repeat_id: int
sequence: str

class Config:
orm_mode = True

class CRCVariation(BaseModel):
tcga_barcode: str
sample_type: str
Expand Down Expand Up @@ -123,4 +134,4 @@ class Config:
orm_mode = True




145 changes: 145 additions & 0 deletions update_tf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
import argparse
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm.exc import MultipleResultsFound, NoResultFound
from strAPI.repeats.models import Repeat, Gene, TRPanel, AlleleSequence
import math
import os
from tqdm import tqdm

#chrom start end acount-AFR acount-AMR acount-EAS acount-EUR acount-SAS
#chr10 89246 89285 CACATACACACACACACACACACACACACACAC:2,CACATACACACACACACACACACACACACACACAC:261,CACATACACACACACACACACACACACACACA
def round_sf(number):
if number == 0:
return 0.0
else:
#using 2 because we decided on 2 sf
precision = 2 - int(math.floor(math.log10(abs(number)))) - 1
return round(number, precision)


def connection_setup(db_path):
engine = create_engine(db_path, echo=False)
Session = sessionmaker(bind=engine)
session = Session()
return engine, session


def process_file(filepath, session, error_log_file, skipped_lines_file):
with open(filepath, "r") as file:
next(file)

pop_indices = {"AFR": 3, "AMR": 4, "EAS": 5, "EUR": 6, "SAS": 7}
total_lines = sum(1 for line in file) # Count total lines in the file

# Reset file pointer to the beginning of the file
file.seek(0)

# Create tqdm progress bar for the current file
progress_bar = tqdm(total=total_lines, desc=f"Processing {os.path.basename(filepath)}", unit="line")

for line_number, line in enumerate(file, start=1):
progress_bar.update(1) # Increment progress bar by 1 for each line processed
columns = line.strip().split()

try:
#print("Columns:", columns)
db_repeat = session.query(Repeat).filter(
Repeat.source == 'EnsembleTR',
Repeat.start == int(columns[1]),
Repeat.end.between(int(columns[2]) - 2, int(columns[2]) + 2),
Repeat.chr == columns[0]
).one()
# print("Database Repeat:", db_repeat)

except ValueError as ex:
# Handle the case where the start column cannot be converted to an integer
error_message = f"Invalid value for start column: {ex} with line contents: {columns}"
#print(error_message)
error_log_file.write(error_message + '\n') # Write error message to log file
continue

except IndexError as ex:
# Handle the case where there are not enough columns in the line
error_message = f"Not enough columns in line: {line}"
#print(error_message)
error_log_file.write(error_message + '\n') # Write error message to log file
continue

except MultipleResultsFound as ex:
error_message = f"Multiple results found for chr: {columns[0]}:{columns[1]}-{columns[2]}"
error_log_file.write(error_message + '\n') # Write error message to log file
continue

except NoResultFound as ex:
error_message = f"No matching record found for chr: {columns[0]}:{columns[1]}-{columns[2]}"
error_log_file.write(error_message + '\n') # Write error message to log file
continue

##chrom start end acount-AFR acount-AMR acount-EAS acount-EUR acount-SAS
#chr10 89246 89285 CACATACACACACACACACACACACACACACAC:2,CACATACACACACACACACACACACACACACACAC:261,

for pop in ["AFR", "AMR", "EAS", "EUR", "SAS"]:
popdata = columns[pop_indices[pop]]
if popdata != ".": # Check if there is any data for this population
popdata_items = popdata.split(",")
#CACATACACACACACACACACACACACACACAC:2

# If there's only one sequence or more, calculate total counts accordingly
if len(popdata_items) == 1 and ':' in popdata_items[0]:
total = int(popdata_items[0].split(":")[1])
else:
total = sum(int(item.split(":")[1]) for item in popdata_items if ':' in item)

for a in popdata_items:
allele_items = a.split(":")
if len(allele_items) == 2:
aseq, acount = allele_items
acount = int(acount)
freq = round_sf(acount / total) if total > 0 else 0
db_seq = AlleleSequence(
repeat_id=db_repeat.id,
population=pop,
n_effective=db_repeat.n_effective,
frequency=freq,
num_called=acount,
sequence=aseq
)
db_repeat.allseq.append(db_seq)
else:
# Log or handle the case where an allele doesn't have the expected format
skipped_lines_file.write(f"Skipped or malformed allele entry at line {line_number}: {line}\n")


# Close the tqdm progress bar
progress_bar.close()



def main():
default_path = "postgresql://webstr:webstr@localhost:5432/strdb"
default_error_log = "error_chr1.log"
default_skipped_lines_file = "skipped_lines_chr1.txt"
default_file ="/gymreklab-tscc/creeve/chr/chr1.tab"
parser = argparse.ArgumentParser(description="Insert data into PostgreSQL database")
parser.add_argument("--db_path", type=str, default=default_path, help="PostgreSQL connection URL")
parser.add_argument("--file", type=str, default=default_file,help="File to process")
parser.add_argument("--error_log", type=str, default=default_error_log, help="Path to error log file")
parser.add_argument("--skipped_lines_file", type=str, default=default_skipped_lines_file, help="Path to skipped lines file")

args = parser.parse_args()
engine, session = connection_setup(args.db_path)

with open(args.error_log, 'w') as error_log_file, \
open(args.skipped_lines_file, 'w') as skipped_lines_file:
process_file(args.file, session, error_log_file, skipped_lines_file)

session.commit()
session.close()
engine.dispose()
print("Data inserted successfully")

if __name__ == "__main__":
main()