Skip to content

Commit 463811c

Browse files
authored
Merge pull request #28 from pharmorg/bulk_insert
Decrease time to load DB to approximately 5 minutes instead of 60 minutes.
2 parents 461e027 + c29e412 commit 463811c

File tree

3 files changed

+111
-56
lines changed

3 files changed

+111
-56
lines changed

openfda/app/db/models.py

+4-2
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,9 @@ class Drugs(Base):
1616
pharm_class = Column(String(100))
1717
dosage_form = Column(String(100))
1818
product_type = Column(Integer, ForeignKey('productType.id'))
19-
routes = relationship("Routes", backref='dx_route')
19+
route1 = Column(Integer, ForeignKey('route.id'))
20+
route2 = Column(Integer, ForeignKey('route.id'))
21+
route3 = Column(Integer, ForeignKey('route.id'))
2022
pharm_classes = relationship("PharmClasses", backref="dx_pharmClass")
2123

2224
def __repr__(self):
@@ -29,7 +31,7 @@ class Routes(Base):
2931

3032
id = Column(Integer, primary_key=True)
3133
route = Column(String(100))
32-
drug_id = Column(Integer, ForeignKey('drug.id'))
34+
3335

3436
def __repr__(self):
3537
return "<Routes(route='%s')>" % (self.route)

openfda/app/utils/load_data.py

+106-53
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
import requests
22
import sys
33
import json
4+
import time
45
from pathlib import Path
56
from db.models import Drugs, Routes, ProductTypes, PharmClasses
67
from db.connect import connection
@@ -24,64 +25,110 @@ def buildProductTypes(session, data):
2425
for item in responses:
2526
exists = session.query(ProductTypes).filter(ProductTypes.product_type == item).scalar()
2627
if not exists:
27-
logger.info(f"New response added: {item}\n")
28+
logger.debug(f"New response added: {item}\n")
2829
row = ProductTypes(product_type = item)
2930
session.add(row)
3031

31-
def addData(session, data):
32+
def buildRouteTypes(session, data):
33+
routes = set()
3234
try:
35+
for item in data['results']:
36+
try:
37+
for route in item['route']:
38+
routes.add(route)
39+
except:
40+
pass
41+
except:
42+
logger.error("Failed to load routes table")
43+
return
44+
logger.debug(f"New data route(s): {str(routes)}\n")
45+
for item in routes:
46+
logger.debug(item)
47+
exists = session.query(Routes).filter(Routes.route == item).scalar()
48+
if not exists:
49+
logger.debug(f"New response added: {item}\n")
50+
row = Routes(route = item)
51+
session.add(row)
52+
53+
54+
def buildDrugs(session, drugs_data):
55+
objects = []
56+
product_ids = []
57+
for data in drugs_data['results']:
3358
try:
34-
product_id = data['product_id']
59+
try:
60+
product_id = data['product_id']
61+
except:
62+
logger.warning(f"A product ID does not exists for {data['generic_name']}")
63+
return
64+
try:
65+
generic_name = data['generic_name'].lower()[:300]
66+
except:
67+
generic_name = ""
68+
try:
69+
brand_name = data['brand_name'].lower()[:300]
70+
except:
71+
brand_name = ""
72+
try:
73+
classList = []
74+
for classItem in data['pharm_class']:
75+
classList.append(classItem)
76+
except:
77+
classList = []
78+
try:
79+
routesList = []
80+
for route in data['route']:
81+
route_id = session.query(Routes.id).filter(Routes.route == route)
82+
routesList.append(route_id)
83+
except:
84+
routesList = []
85+
try:
86+
form = data['dosage_form'].lower()[:100]
87+
except:
88+
form = ""
89+
try:
90+
productTypeId = session.query(ProductTypes.id).filter(ProductTypes.product_type == data['product_type'])
91+
except:
92+
productTypeId = None
3593
except:
36-
logger.warning(f"A product ID does not exists for {data['generic_name']}")
94+
logger.error(f"JSON failure\n")
3795
return
38-
try:
39-
generic_name = data['generic_name'].lower()[:300]
40-
except:
41-
generic_name = ""
42-
try:
43-
brand_name = data['brand_name'].lower()[:300]
44-
except:
45-
brand_name = ""
46-
try:
47-
classList = []
48-
for classItem in data['pharm_class']:
49-
classList.append(classItem)
50-
except:
51-
classList = []
52-
try:
53-
routesList = []
54-
for route in data['route']:
55-
routesList.append(route.lower())
56-
except:
57-
routesList = []
58-
try:
59-
form = data['dosage_form'].lower()[:100]
60-
except:
61-
form = ""
62-
try:
63-
productTypeId = session.query(ProductTypes.id).filter(ProductTypes.product_type == data['product_type'])
64-
except:
65-
productTypeId = None
66-
except:
67-
logger.error(f"JSON failure\n")
68-
return
69-
exists = session.query(Drugs).filter(Drugs.product_id == product_id).scalar()
70-
if not exists:
71-
logger.info(f"New data added: {generic_name}|{brand_name}|{form}|{product_id}\n")
72-
row = Drugs(product_id=product_id, generic_name=generic_name, brand_name=brand_name, dosage_form=form, product_type = productTypeId)
73-
session.add(row)
74-
for route in routesList:
75-
logger.info(f"New data added: {generic_name}|{route}\n")
76-
route_row = Routes(route=route, dx_route=row)
77-
session.add(route_row)
78-
for pharmClass in classList:
79-
logger.info(f"New data added: {generic_name}|{pharmClass}\n")
80-
class_row = PharmClasses(pharm_class=pharmClass, dx_pharmClass=row)
81-
session.add(class_row)
82-
else:
83-
logger.warning(f"Data already exists: {product_id}|{generic_name}|{brand_name}\n")
96+
# exists = session.query(Drugs).filter(Drugs.product_id == product_id).scalar()
97+
if product_id not in product_ids:
98+
product_ids.append(product_id)
99+
logger.debug(f"New data added: {generic_name}|{brand_name}|{form}|{product_id}\n")
100+
if len(routesList) == 3:
101+
route1 = routesList[0]
102+
route2 = routesList[1]
103+
route3 = routesList[2]
104+
elif len(routesList) == 2:
105+
route1 = routesList[0]
106+
route2 = routesList[1]
107+
route3 = None
108+
elif len(routesList) == 1:
109+
route1 = routesList[0]
110+
route2 = None
111+
route3 = None
112+
else:
113+
route1 = None
114+
route2 = None
115+
route3 = None
116+
row = Drugs(product_id=product_id, generic_name=generic_name, brand_name=brand_name, dosage_form=form, route1=route1, route2=route2, route3=route3, product_type=productTypeId)
117+
objects.append(row)
118+
# for route in routesList:
119+
# logger.info(f"New data added: {generic_name}|{route}\n")
120+
# route_row = Routes(route=route, dx_route=row)
121+
# objects.append(route_row)
122+
for pharmClass in classList:
123+
logger.debug(f"New data added: {generic_name}|{pharmClass}\n")
124+
class_row = PharmClasses(pharm_class=pharmClass, dx_pharmClass=row)
125+
objects.append(class_row)
126+
else:
127+
logger.warning(f"Data already exists: {product_id}|{generic_name}|{brand_name}\n")
128+
start = time.process_time()
129+
session.add_all(objects)
84130
session.commit()
131+
logger.info(f"Commit drugs db: {str(time.process_time() - start)}")
85132
return
86133

87134

@@ -101,10 +148,16 @@ def main(session):
101148
# Need a flag / config / etc. to drop tables on demand for rebuild.
102149

103150
# Need an improved process to minimize circling through the JSON file twice.
151+
start = time.process_time()
104152
buildProductTypes(session, data)
105-
for line in data['results']:
106-
logger.debug(f"{line}")
107-
addData(session, line)
153+
logger.info(f"Add product types: {str(time.process_time() - start)}")
154+
start = time.process_time()
155+
buildRouteTypes(session, data)
156+
logger.info(f"Add routes: {str(time.process_time() - start)}")
157+
start= time.process_time()
158+
buildDrugs(session, data)
159+
logger.info(f"Add drugs information: {str(time.process_time() - start)}")
160+
f.close()
108161

109162

110163
if __name__ == "__main__":

openfda/app/utils/log.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@ def startLogging(logger_name):
1414
Path(LOG_DIR).mkdir(parents=True, exist_ok=True)
1515
logger = logging.getLogger(logger_name)
1616
formatter = logging.Formatter('[%(asctime)s] p%(process)s {%(pathname)s:%(lineno)d} %(levelname)s - %(message)s','%Y/%m/%d %H:%M:%S')
17-
logger.setLevel(logging.DEBUG)
17+
logger.setLevel(logging.INFO)
1818
# enable file logging
1919
log_filename = datetime.datetime.now().strftime("%y%m%d_") + logger_name + '.log'
2020
filehandler = logging.FileHandler(LOG_DIR / log_filename)

0 commit comments

Comments
 (0)