-
Notifications
You must be signed in to change notification settings - Fork 6
Eliminating Duplications (just the history of the application)
James Kent edited this page Mar 22, 2024
·
2 revisions
from sqlalchemy import func
query = session.query(BaseStudy.doi, func.count(BaseStudy.doi)).group_by(BaseStudy.doi).having(func.count(BaseStudy.doi) > 1)
query = db.session.query(BaseStudy.doi, func.count(BaseStudy.doi)).group_by(BaseStudy.doi).having(func.count(BaseStudy.doi) > 1)
dup_dois = query.all()
dup_dois
empty_string_dois = BaseStudy.query.filter_by(doi="").all()
empty_string_dois
for bs in empty_string_dois:
bs.doi = None
db.session.add_all(empty_string_dois)
db.session.commit()
empty_string_pmids = BaseStudy.query.filter_by(pmid="").all()
for bs in empty_string_pmids:
bs.pmids = None
db.session.add_all(empty_string_pmids)
db.session.commit()
query = db.session.query(BaseStudy.doi, func.count(BaseStudy.doi)).group_by(BaseStudy.doi).having(func.count(BaseStudy.doi) > 1)
dup_dois = query.all()
dup_dois
query = db.session.query(BaseStudy.doi).group_by(BaseStudy.doi).having(func.count(BaseStudy.doi) > 1)
dup_dois
dup_dois = query.all()
dup_dois
query = db.session.query(BaseStudy.doi, BaseStudy.id).group_by(BaseStudy.doi).having(func.count(BaseStudy.doi) > 1)
dup_dois = query.all()
query = db.session.query(BaseStudy.doi).group_by(BaseStudy.doi).having(func.count(BaseStudy.doi) > 1)
dup_dois = query.all()
db.session.rollback()
dup_dois = query.all()
dup_dois
query = db.session.query(BaseStudy.doi).group_by(BaseStudy.doi, BaseStudy.id).having(func.count(BaseStudy.doi) > 1)
dup_dois = query.all()
dup_dois
query = db.session.query(BaseStudy.doi).group_by(BaseStudy.doi, func.group_concat(BaseStudy.id)).having(func.count(BaseStudy.doi) > 1)
dup_dois = query.all()
query = db.session.query(BaseStudy.doi).group_by(BaseStudy.doi, func.string_agg(BaseStudy.id, ',')).having(func.count(BaseStudy.doi) > 1)
dup_dois = query.all()
db.session.rollback()
query = db.session.query(BaseStudy.doi).group_by(BaseStudy.doi, func.string_agg(BaseStudy.id, ',')).having(func.count(BaseStudy.doi) > 1)
dup_dois = query.all()
db.session.rollback()
query = db.session.query(BaseStudy.doi).group_by(BaseStudy.doi).having(func.count(BaseStudy.doi) > 1)
dup_dois = query.all()
dup_dois
dup_dois[0][0]
BaseStudy.query.filter_by(dup_dois[0][0])
BaseStudy.query.filter_by(doi=dup_dois[0][0])
BaseStudy.query.filter_by(doi=dup_dois[0][0]).all()
pip install biopython
from Bio import Entrez
def doi_to_pmid(doi):
Entrez.email = "your@email.com" # Put your email here
handle = Entrez.esearch(db="pubmed", term=doi)
record = Entrez.read(handle)
handle.close()
pmids = record['IdList']
return pmids
from Bio import Entrez
def doi_to_pmid(doi):
Entrez.email = "jamesdkent21@gmail.com" # Put your email here
handle = Entrez.esearch(db="pubmed", term=doi)
record = Entrez.read(handle)
handle.close()
pmids = record['IdList']
return pmids
doi_to_pmid(dup_dois[0][0])
dup_bs = BaseStudy.query.filter_by(doi=dup_dois[0][0]).all()
dup_bs
dup_bs[0].pmid
dup_bs[1].pmid
def get_paper_doi(pmid):
Entrez.email = "your@email.com" # Put your email here
handle = Entrez.efetch(db="pubmed", id=pmid, retmode="xml")
record = Entrez.read(handle)
handle.close()
if record['PubmedArticle']:
article = record['PubmedArticle'][0]['PubmedData']['ArticleIdList']
for item in article:
if item.attributes['IdType'] == 'doi':
return item
return "DOI not found for the given PMID."
get_paper_doi(dup_bs[0].pmid)
res = get_paper_doi(dup_bs[0].pmid)
str(res)
def get_paper_doi(pmid):
Entrez.email = "jamesdkent@gmail.com" # Put your email here
handle = Entrez.efetch(db="pubmed", id=pmid, retmode="xml")
record = Entrez.read(handle)
handle.close()
if record['PubmedArticle']:
article = record['PubmedArticle'][0]['PubmedData']['ArticleIdList']
for item in article:
if item.attributes['IdType'] == 'doi':
return str(item)
return "DOI not found for the given PMID."
to_commit = []
for dup in dup_dois:
dup_bs = BaseStudy.query.filter_by(doi=dup[0]).all()
pmid = doi_to_pmid(dup[0])
non_matching_bs = []
for bs in dup_bs:
if bs.pmid == pmid:
continue
if bs.pmid:
doi = get_paper_doi(bs.pmid) if "found" not in doi else bs.doi
if doi != bs.doi:
bs.doi = doi
to_commit.append(bs)
print(f"CHANGING {bs.name}")
for v in bs.versions:
v.doi = doi
to_commit.append(v)
to_commit = []
for dup in dup_dois:
dup_bs = BaseStudy.query.filter_by(doi=dup[0]).all()
pmid = doi_to_pmid(dup[0])
non_matching_bs = []
for bs in dup_bs:
if bs.pmid == pmid:
continue
if bs.pmid:
doi = get_paper_doi(bs.pmid)
doi = doi if "found" not in doi else bs.doi
if doi != bs.doi:
bs.doi = doi
to_commit.append(bs)
print(f"CHANGING {bs.name}")
for v in bs.versions:
v.doi = doi
to_commit.append(v)
to_commit
db.session.add_all(to_commit)
db.session.commit()
dup_bs = BaseStudy.query.filter_by(doi=dup_dois[0][0]).all()
dup_bs
query = db.session.query(BaseStudy.doi).group_by(BaseStudy.doi).having(func.count(BaseStudy.doi) > 1)
dup_dois = query.all()
dup_dois
bss = BaseStudy.query.filter_by(doi="10.1016/j.cortex.2013.01.015").all()
bss
bss[0].doi
bss[0].pmid
bss[1].pmid
bss[1].name
bss[0].name
doi_to_pmid(bss[0].doi)
get_paper_doi('23994216')
to_commit = []
for dup in dup_dois:
dup_bs = BaseStudy.query.filter_by(doi=dup[0]).all()
pmid = doi_to_pmid(dup[0])
non_matching_bs = []
for bs in dup_bs:
if bs.pmid == pmid:
continue
if bs.pmid:
doi = get_paper_doi(bs.pmid)
doi = doi if "found" not in doi else bs.doi
if doi != bs.doi:
bs.doi = doi
to_commit.append(bs)
print(f"CHANGING {bs.name}")
for v in bs.versions:
v.doi = doi
to_commit.append(v)
db.session.add_all(to_commit)
db.session.commit()
query = db.session.query(BaseStudy.doi).group_by(BaseStudy.doi).having(func.count(BaseStudy.doi) > 1)
dup_dois = query.all()
dup_dois
to_commit = []
for dup in dup_dois:
dup_bs = BaseStudy.query.filter_by(doi=dup[0]).all()
pmid = doi_to_pmid(dup[0])
non_matching_bs = []
for bs in dup_bs:
if bs.pmid == pmid:
continue
if bs.pmid:
doi = get_paper_doi(bs.pmid)
doi = doi if "found" not in doi else bs.doi
if doi != bs.doi:
bs.doi = doi
to_commit.append(bs)
print(f"CHANGING {bs.name}")
for v in bs.versions:
v.doi = doi
to_commit.append(v)
to_commit
bss = BaseStudy.query.filter_by(doi="10.1016/j.neurobiolaging.2023.08.008").all()
bss
bss[0].pmid
bss[1].pmid
doi_to_pmid(bss[1].pmid)
doi_to_pmid(bss[1].doi)
doi_to_pmid(bss[0].pmid)
doi_to_pmid(bss[0].doi)
doi_to_pmid(bss[1].pmid)
to_commit = []
for dup in dup_dois:
dup_bs = BaseStudy.query.filter_by(doi=dup[0]).all()
pmid = doi_to_pmid(dup[0])[0]
non_matching_bs = []
for bs in dup_bs:
if bs.pmid == pmid:
continue
if bs.pmid:
doi = get_paper_doi(bs.pmid)
doi = doi if "found" not in doi else bs.doi
if doi != bs.doi:
bs.doi = doi
to_commit.append(bs)
print(f"CHANGING {bs.name}")
for v in bs.versions:
v.doi = doi
to_commit.append(v)
query = db.session.query(BaseStudy.pmid).group_by(BaseStudy.pmid).having(func.count(BaseStudy.pmid) > 1)
dup_pmids = query.all()
dup_pmids
empty_string_pmids = BaseStudy.query.filter_by(pmid='').all()
len(empty_string_pmids)
to_commit=[]
for bs in empty_string_pmids:
bs.pmid = None
to_commit.append(bs)
for v in bs.versions:
v.pmid = None if v.pmid == '' else v.pmid
to_commit.append(v)
db.session.add_all(to_commit)
db.session.commit()
empty_string_pmids = BaseStudy.query.filter_by(pmid='').all()
empty_string_pmids
query = db.session.query(BaseStudy.pmid).group_by(BaseStudy.pmid).having(func.count(BaseStudy.pmid) > 1)
dup_pmids = query.all()
dup_pmids
BaseStudy.query.filter_by(pmid="10669519").all()
bss = BaseStudy.query.filter_by(pmid="10669519").all()
bss[0]
bss[0].doi
bss[1].doi
bss[0].name
bss[1].name
bss[1].user
bss[0].user
bss[1].user
bss[1].doi
bss[1].pmid
bss[1].versions
bss[0].versions
bss[0].versions[0].user
bss[0].versions[0].source
bss[0].versions[1].source
bss[0].versions[2].source
bss[0].abstracts
bss[0].description
bss[1].description
to_commit = []
for dup in dup_pmids:
dup_bs = BaseStudy.query.filter_by(pmid=dup[0]).all()
doi = get_paper_doi(dup[0])
non_matching_bs = []
for bs in dup_bs:
if bs.doi == doi:
continue
if bs.doi:
pmid = doi_to_pmid(bs.doi)[0]
if pmid != bs.pmid:
bs.pmid = pmid
to_commit.append(bs)
print(f"CHANGING {bs.name}")
for v in bs.versions:
v.pmid = pmid
to_commit.append(v)
to_commit
db.session.add_all(to_commit)
db.session.commit()
query = db.session.query(BaseStudy.pmid).group_by(BaseStudy.pmid).having(func.count(BaseStudy.pmid) > 1)
dup_pmids = query.all()
dup_pmids
to_commit = []
for dup in dup_pmids:
dup_bs = BaseStudy.query.filter_by(pmid=dup[0]).all()
doi = get_paper_doi(dup[0])
non_matching_bs = []
name = None
non_matching_dups = []
for bs in dup_bs:
if bs.doi == doi:
name = bs.name
else:
non_matching_dups.append(bs)
non_matching_dups
import re
def preprocess_string(string):
# Remove non-alphanumeric characters and whitespace
return re.sub(r'[^a-zA-Z0-9]', '', string)
def compare_strings(string1, string2):
processed_string1 = preprocess_string(string1)
processed_string2 = preprocess_string(string2)
return processed_string1 == processed_string2
to_commit = []
for dup in dup_pmids:
dup_bs = BaseStudy.query.filter_by(pmid=dup[0]).all()
doi = get_paper_doi(dup[0])
non_matching_bs = []
name = None
non_matching_dups = []
for bs in dup_bs:
if bs.doi == doi:
name = bs.name
else:
non_matching_dups.append(bs)
for nm_bs in non_matching_dups:
match = compare_strings(nm_bs.name, name)
if match:
print(f"DUP: {name}")
to_commit = []
to_delete = []
for dup in dup_pmids:
dup_bs = BaseStudy.query.filter_by(pmid=dup[0]).all()
doi = get_paper_doi(dup[0])
non_matching_bs = []
name = None
non_matching_dups = []
for bs in dup_bs:
if bs.doi == doi:
main_bs = bs
name = bs.name
else:
non_matching_dups.append(bs)
for nm_bs in non_matching_dups:
match = compare_strings(nm_bs.name, name)
if match:
print(f"DUP: {name}")
main_bs.versions.extend(nm_bs.versions)
for attr in ['name', 'description', 'metadata', 'publication', 'authors', 'year', 'level']:
setattr(main_bs, attr, getattr(main_bs, attr, getattr(nm_bs, attr, None)))
to_delete.append(nm_bs)
to_commit = []
to_delete = []
for dup in dup_pmids:
dup_bs = BaseStudy.query.filter_by(pmid=dup[0]).all()
doi = get_paper_doi(dup[0])
non_matching_bs = []
name = None
non_matching_dups = []
for bs in dup_bs:
if bs.doi == doi:
main_bs = bs
name = bs.name
else:
non_matching_dups.append(bs)
for nm_bs in non_matching_dups:
match = compare_strings(nm_bs.name, name)
if match:
print(f"DUP: {name}")
main_bs.versions.extend(nm_bs.versions)
for attr in ['name', 'description', 'metadata', 'publication', 'authors', 'year', 'level']:
setattr(main_bs, attr, getattr(main_bs, attr, getattr(nm_bs, attr, None)))
to_delete.append(nm_bs)
to_commit.append(main_bs)
to_commit
to_delete
to_delete[0].versions
to_delete[1].versions
to_delete[-`].versions
to_delete[-1].versions
for bs in to_delete:
db.session.delete(bs)
db.session.commit()
db.session.add_all(to_commit)
db.session.commit()
query = db.session.query(BaseStudy.pmid).group_by(BaseStudy.pmid).having(func.count(BaseStudy.pmid) > 1)
dup_pmids = query.all()
dup_pmids
BaseStudy.query.filter_by(pmid='16371250').all()
bss = BaseStudy.query.filter_by(pmid='16371250').all()
bss[0]
bss[0].name
bss[1].name
compare_strings(bss[0].name, bss[1].name)
preprocess_string(bss[0].name)
preprocess_string(bss[1].name)
preprocess_string(bss[0].name) == preprocess_string(bss[1].name)
preprocess_string(bss[0].name).lower() == preprocess_string(bss[1].name).lower()
import re
def preprocess_string(string):
# Remove non-alphanumeric characters and whitespace
return re.sub(r'[^a-zA-Z0-9]', '', string)
def compare_strings(string1, string2):
processed_string1 = preprocess_string(string1).lower()
processed_string2 = preprocess_string(string2).lower()
return processed_string1 == processed_string2
to_commit = []
to_delete = []
for dup in dup_pmids:
dup_bs = BaseStudy.query.filter_by(pmid=dup[0]).all()
doi = get_paper_doi(dup[0])
non_matching_bs = []
name = None
non_matching_dups = []
for bs in dup_bs:
if bs.doi == doi:
main_bs = bs
name = bs.name
else:
non_matching_dups.append(bs)
for nm_bs in non_matching_dups:
match = compare_strings(nm_bs.name, name)
if match:
print(f"DUP: {name}")
main_bs.versions.extend(nm_bs.versions)
for attr in ['name', 'description', 'metadata', 'publication', 'authors', 'year', 'level']:
setattr(main_bs, attr, getattr(main_bs, attr, getattr(nm_bs, attr, None)))
to_delete.append(nm_bs)
to_commit.append(main_bs)
to_delete
to_delete[0]
to_delete[0].name
to_delete[4].name
to_delete[5].name
to_delete[6].name
to_delete[12].name
to_delete[12].doi
to_delete[12].pmid
to_delete[13].pmid
to_delete[13].name
to_delete[13].doi
to_delete[13].versions
to_commit
to_commit[12]
to_commit[12].name
to_commit[12].versions
to_delete[13].user
to_delete[12].user
for bs in to_delete:
db.session.delete(bs)
db.session.add_all(to_commit)
db.session.commit()
query = db.session.query(BaseStudy.pmid).group_by(BaseStudy.pmid).having(func.count(BaseStudy.pmid) > 1)
dup_pmids = query.all()
dup_pmids
dup_dois
BaseStudy.query.filter_by(doi="10.1016/j.neurobiolaging.2023.08.008").all()
bss = BaseStudy.query.filter_by(doi="10.1016/j.neurobiolaging.2023.08.008").all()
bss[0].pmid
bss[1].pmid
bss[1].name
bss[0].name
bss[0].user
bss[0].versions
bss[1].versions
to_commit = []
to_delete = []
for dup in dup_dois:
dup_bs = BaseStudy.query.filter_by(pmid=dup[0]).all()
pmid = doi_to_pmid(dup[0])
non_matching_bs = []
name = None
non_matching_dups = []
for bs in dup_bs:
if bs.pmid == pmid:
main_bs = bs
name = bs.name
else:
non_matching_dups.append(bs)
for nm_bs in non_matching_dups:
match = compare_strings(nm_bs.name, name)
if match:
print(f"DUP: {name}")
main_bs.versions.extend(nm_bs.versions)
for attr in ['name', 'description', 'metadata_', 'publication', 'authors', 'year', 'level']:
setattr(main_bs, attr, getattr(main_bs, attr, getattr(nm_bs, attr, None)))
to_delete.append(nm_bs)
to_commit.append(main_bs)
to_commit = []
to_delete = []
for dup in dup_dois:
dup_bs = BaseStudy.query.filter_by(doi=dup[0]).all()
pmid = doi_to_pmid(dup[0])
non_matching_bs = []
name = None
non_matching_dups = []
for bs in dup_bs:
if bs.pmid == pmid:
main_bs = bs
name = bs.name
else:
non_matching_dups.append(bs)
for nm_bs in non_matching_dups:
match = compare_strings(nm_bs.name, name)
if match:
print(f"DUP: {name}")
main_bs.versions.extend(nm_bs.versions)
for attr in ['name', 'description', 'metadata_', 'publication', 'authors', 'year', 'level']:
setattr(main_bs, attr, getattr(main_bs, attr, getattr(nm_bs, attr, None)))
to_delete.append(nm_bs)
to_commit.append(main_bs)
to_commit = []
to_delete = []
for dup in dup_dois:
dup_bs = BaseStudy.query.filter_by(pmid=dup[0]).all()
pmid = doi_to_pmid(dup[0])
non_matching_bs = []
name = None
non_matching_dups = []
for bs in dup_bs:
if bs.pmid == pmid:
main_bs = bs
name = bs.name
else:
non_matching_dups.append(bs)
for nm_bs in non_matching_dups:
match = compare_strings(nm_bs.name or '', name)
if match:
print(f"DUP: {name}")
main_bs.versions.extend(nm_bs.versions)
for attr in ['name', 'description', 'metadata_', 'publication', 'authors', 'year', 'level']:
setattr(main_bs, attr, getattr(main_bs, attr, getattr(nm_bs, attr, None)))
to_delete.append(nm_bs)
to_commit.append(main_bs)
bss[0].name
bss[1].name
compare_strings(bss[0], bss[1])
compare_strings(bss[0].name, bss[1].name)
dup_dois
bss[0].name
bss[0].pmis
bss[0].pmid
bss[0].doi
get_paper_doi(bss[0]).doi
get_paper_doi(bss[0])
get_paper_doi(bss[0].doi)
get_paper_doi(bss[0].pmid)
get_paper_doi(bss[0].pmid)
get_paper_doi(bss[0].doi)
bss[0].doi
doi_to_pmid(bss[0].doi)
get_paper_doi(bss[0].pmid)
get_paper_doi(bss[1].pmid)
to_commit = []
to_delete = []
for dup in dup_dois:
dup_bs = BaseStudy.query.filter_by(pmid=dup[0]).all()
pmid = doi_to_pmid(dup[0])[0]
non_matching_bs = []
name = None
non_matching_dups = []
for bs in dup_bs:
if bs.pmid == pmid:
main_bs = bs
name = bs.name
else:
non_matching_dups.append(bs)
for nm_bs in non_matching_dups:
match = compare_strings(nm_bs.name or '', name)
if match:
print(f"DUP: {name}")
main_bs.versions.extend(nm_bs.versions)
for attr in ['name', 'description', 'metadata_', 'publication', 'authors', 'year', 'level']:
setattr(main_bs, attr, getattr(main_bs, attr, getattr(nm_bs, attr, None)))
to_delete.append(nm_bs)
to_commit.append(main_bs)
to_commit
doi_to_pmid(bss[0].doi)
doi_to_pmid(bss[0].doi)[0]
bss[0].pmid
bss[1].pmid
to_commit = []
to_delete = []
for dup in dup_dois:
dup_bs = BaseStudy.query.filter_by(pmid=dup[0]).all()
pmid = doi_to_pmid(dup[0])[0]
non_matching_bs = []
name = None
main_bs = None
non_matching_dups = []
for bs in dup_bs:
if bs.pmid == pmid:
main_bs = bs
name = bs.name
else:
non_matching_dups.append(bs)
if not main_bs:
print('no main bs found')
continue
for nm_bs in non_matching_dups:
match = compare_strings(nm_bs.name or '', name)
if match:
print(f"DUP: {name}")
main_bs.versions.extend(nm_bs.versions)
for attr in ['name', 'description', 'metadata_', 'publication', 'authors', 'year', 'level']:
setattr(main_bs, attr, getattr(main_bs, attr, getattr(nm_bs, attr, None)))
to_delete.append(nm_bs)
to_commit.append(main_bs)
dup_dois
to_commit = []
to_delete = []
for dup in dup_dois:
dup_bs = BaseStudy.query.filter_by(doi=dup[0]).all()
pmid = doi_to_pmid(dup[0])[0]
non_matching_bs = []
name = None
main_bs = None
non_matching_dups = []
for bs in dup_bs:
if bs.pmid == pmid:
main_bs = bs
name = bs.name
else:
non_matching_dups.append(bs)
if not main_bs:
print('no main bs found')
continue
for nm_bs in non_matching_dups:
match = compare_strings(nm_bs.name or '', name)
if match:
print(f"DUP: {name}")
main_bs.versions.extend(nm_bs.versions)
for attr in ['name', 'description', 'metadata_', 'publication', 'authors', 'year', 'level']:
setattr(main_bs, attr, getattr(main_bs, attr, getattr(nm_bs, attr, None)))
to_delete.append(nm_bs)
to_commit.append(main_bs)
to_delete
to_delete[0].name
to_delete[0].versions
to_commit
to_commit[0].name
to_commit[0].doi
to_commit[0].pmid
for bs in to_delete:
db.session.delete(bs)
db.session.add_all(to_commit)
db.session.commit()
query = db.session.query(BaseStudy.doi).group_by(BaseStudy.doi).having(func.count(BaseStudy.doi) > 1)
dup_dois = query.all()
dup_dois
BaseStudy.query.filter_by(doi="10.1016/j.neuroimage.2014.06.047").all()
bss = BaseStudy.query.filter_by(doi="10.1016/j.neuroimage.2014.06.047").all()
bss[0]
bss[0].pmid
bss[0].name
bss[1].name
bss[1].pmid
bss[1].user
bss[0].user
bss[1].versions
bss[1].versions[0]
bss[1].versions[0].source
bss[1].versions[1].source
bss[0].versions[1].source
bss[0].versions[0].source
bss[0].versions[0]
bss[0].versions[0].name
bss[0].pmid
bss[1].version
bss[1].versions
bss[1].versions.extend(bss[0].versions)
bss[1
]
bss[1].versions
bss[1].publication
bss[0].publication
bss[0].year
bss[1].year
bss[0].versions
bss[0].description
bss[1].description
db.session.delete(bss[0])
db.session.add(bss[1])
db.session.commit()
bss[1].versions
query = db.session.query(BaseStudy.doi).group_by(BaseStudy.doi).having(func.count(BaseStudy.doi) > 1)
dup_dois = query.all()
dup_dois
history