Skip to content

Commit 4eb3d8d

Browse files
committed
heap index initial
1 parent d40d6fa commit 4eb3d8d

File tree

1 file changed

+27
-4
lines changed

1 file changed

+27
-4
lines changed

invertedindex.py

+27-4
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import os
12
import re
23
import argparse
34
from collections import *
@@ -6,8 +7,10 @@
67
from utils import get_context, update_words
78

89
index_categories = ['title', 'text', 'category', 'infobox']
10+
final_index_files = list()
911

1012
pagination_val = 10
13+
page_tags = open("./data/title_tags.txt")
1114

1215
def make_index(iter_doc):
1316
num_pages = 0
@@ -48,6 +51,7 @@ def make_index(iter_doc):
4851
try:
4952
text = text.lower()
5053
title = text+"\n"
54+
page_tags.write(title)
5155
update_words(word_dict["title"], text, istext=True)
5256
except:
5357
pass
@@ -64,10 +68,8 @@ def make_index(iter_doc):
6468

6569
if num_pages % pagination_val == 0:
6670
for key, value in index_dict.items():
67-
# print(key)
68-
# print(value)
6971
file = "./output/"+key[0:2]+str(output_file_num)+".txt"
70-
o = open(file, "w")
72+
o = open(titlefile, "w")
7173
for word in sorted(value):
7274
index = ",".join(value[word])
7375
index = word + "-" + index + "\n"
@@ -77,11 +79,32 @@ def make_index(iter_doc):
7779
for key, val in index_dict.items():
7880
index_dict[key].clear()
7981
elem.clear()
82+
return output_file_num
83+
84+
def make_heaped_index(numer_of_files):
    """Open the final per-category index files and their partial inputs.

    For every name in ``index_categories`` this opens
    ``output/<category>.txt`` for writing and appends the handle to
    ``final_index_files``. It then opens, for reading, each non-empty
    partial file ``output/<first two letters of category><i>.txt`` for
    ``i`` in ``range(numer_of_files)``.

    Args:
        numer_of_files: How many numbered partial files to look for per
            category. ``main`` passes the value returned by ``make_index``.

    Raises:
        OSError: If a partial file is missing (``os.stat`` fails) or a file
            cannot be opened.
    """
    for category in index_categories:
        cat_inp = []
        cat_fp = open("output/" + category + ".txt", "w")
        final_index_files.append(cat_fp)
        for i in range(numer_of_files):
            # Fixed: the original sliced an undefined name `cat`, which
            # raised NameError on the first iteration.
            temp_file = "output/" + category[0:2] + str(i) + ".txt"
            # Empty partial files are not opened.
            if os.stat(temp_file).st_size != 0:
                cat_inp.append(open(temp_file, "r"))
        # NOTE(review): `break` abandons every remaining category, not just
        # this one -- confirm that `continue` was not intended.
        # NOTE(review): the handles in `cat_inp` are neither consumed nor
        # closed yet; presumably a merge step will be added here.
        if not cat_inp:
            break
98+
99+
100+
101+
80102

81103

82104
def main(data, output_dep_folder):
83105
iter_doc = get_context(data)
84-
make_index(iter_doc)
106+
num = make_index(iter_doc)
107+
make_heaped_index(num)
85108

86109
if __name__ == '__main__':
87110

0 commit comments

Comments
 (0)