1
+ import os
1
2
import re
2
3
import argparse
3
4
from collections import *
6
7
from utils import get_context , update_words
7
8
8
9
# Field names that each get their own index.
index_categories = ['title', 'text', 'category', 'infobox']
# Handles of the per-category output files opened by make_heaped_index().
final_index_files = list()

# make_index() flushes its in-memory index whenever the page count is a
# multiple of this value.
pagination_val = 10
# Must be opened for writing: make_index() calls page_tags.write(...) for each
# title.  With the default read mode that write raises inside a bare
# ``except: pass``, which silently skipped the title indexing that follows it.
page_tags = open("./data/title_tags.txt", "w")
11
14
12
15
def make_index (iter_doc ):
13
16
num_pages = 0
@@ -48,6 +51,7 @@ def make_index(iter_doc):
48
51
try :
49
52
text = text .lower ()
50
53
title = text + "\n "
54
+ page_tags .write (title )
51
55
update_words (word_dict ["title" ], text , istext = True )
52
56
except :
53
57
pass
@@ -64,10 +68,8 @@ def make_index(iter_doc):
64
68
65
69
if num_pages % pagination_val == 0 :
66
70
for key , value in index_dict .items ():
67
- # print(key)
68
- # print(value)
69
71
file = "./output/" + key [0 :2 ]+ str (output_file_num )+ ".txt"
70
- o = open (file , "w" )
72
+ o = open (titlefile , "w" )
71
73
for word in sorted (value ):
72
74
index = "," .join (value [word ])
73
75
index = word + "-" + index + "\n "
@@ -77,11 +79,32 @@ def make_index(iter_doc):
77
79
for key , val in index_dict .items ():
78
80
index_dict [key ].clear ()
79
81
elem .clear ()
82
+ return output_file_num
83
+
84
def make_heaped_index(numer_of_files):
    """Open each category's output file and the partial files that feed it.

    For every name in ``index_categories`` this creates (truncating)
    ``output/<category>.txt``, records the handle in ``final_index_files``,
    and opens every non-empty partial file
    ``output/<first two letters of category><i>.txt`` for ``i`` in
    ``range(numer_of_files)``.

    Args:
        numer_of_files: Number of partial files per category to look for.

    Raises:
        OSError: If an output file cannot be created, or a partial file
            cannot be stat'ed or opened (for example, it does not exist).
    """
    for category in index_categories:
        cat_fp = open("output/" + category + ".txt", "w")
        final_index_files.append(cat_fp)

        # Partial files are named with the category's two-letter prefix, the
        # same "<key[0:2]><n>.txt" pattern make_index() builds.  The previous
        # code sliced the undefined name ``cat`` here and raised NameError.
        prefix = "output/" + category[0:2]

        cat_inp = list()
        for i in range(numer_of_files):
            temp_file = prefix + str(i) + ".txt"
            # Empty partial files contribute nothing, so they are not opened.
            if os.stat(temp_file).st_size != 0:
                cat_inp.append(open(temp_file, "r"))

        # NOTE(review): the handles in cat_inp are neither read nor closed
        # here; presumably a merge step is still to be written - confirm.
        if not cat_inp:
            # NOTE(review): this stops processing the remaining categories
            # too; ``continue`` may have been intended - confirm.
            break
98
+
99
+
100
+
101
+
80
102
81
103
82
104
def main(data, output_dep_folder):
    """Run make_index() over the parsed input, then make_heaped_index().

    Args:
        data: Passed straight to ``get_context`` to obtain the document
            iterator consumed by ``make_index``.
        output_dep_folder: Accepted but not used by this function.
    """
    partial_file_count = make_index(get_context(data))
    make_heaped_index(partial_file_count)
85
108
86
109
if __name__ == '__main__' :
87
110
0 commit comments