forked from BangLiu/ACS-QG
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathDA_main.py
185 lines (165 loc) · 7.25 KB
/
DA_main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
# -*- coding: utf-8 -*-
"""
Data augmenter main file.
It handles SQuAD or other datasets to select all possible answers, clues, and question types.
The selected data can be read by QG data loader, and generate questions.
"""
# !!! for running experiments on Venus
from config import *
from common.constants import EXP_PLATFORM
if EXP_PLATFORM.lower() == "venus":
from nltk import data
data.path.append('./nltk_need/nltk_data/')
import json
import nltk
import codecs
from tqdm import tqdm
from data_augmentor.FQG_data_augmentor import augment_qg_data, get_sample_probs
from util.file_utils import save, load, make_path
def normalize_text(text):
"""
Replace some special characters in text.
"""
# NOTICE: don't change the text length.
# Otherwise, the answer position is changed.
text = text.replace("''", '" ').replace("``", '" ').replace("\t", " ")
return text
def wiki2sentences(input_path, output_path, paragraphs_path, max_length=100, min_length=5, max_plength=400, min_plength=5):
make_path(output_path)
make_path(paragraphs_path)
outfile = open(output_path, 'w', encoding='utf8')
outfile_p = open(paragraphs_path, 'w', encoding='utf8')
with codecs.open(input_path, encoding='utf8') as infile:
data = json.load(infile)
pid = 0
sid = 0
for k in data:
paragraph_list = data[k]
for p in paragraph_list:
len_p = len(p.split())
if len_p >= max_plength or len_p <= min_plength:
continue
p = normalize_text(p)
outfile_p.write(str(pid) + "\t" + p.rstrip().replace("\n", "\\n") + "\n")
sentences = nltk.sent_tokenize(p)
for s in sentences:
len_s = len(s.split())
if len_s >= max_length or len_s <= min_length:
continue
s = normalize_text(s)
outfile.write(str(pid) + "\t" + str(sid) + "\t" + s.rstrip().replace("\n", "\\n") + "\n")
sid += 1
pid += 1
infile.close()
outfile.close()
outfile_p.close()
def squad2sentences(input_path, output_path, paragraphs_path,
max_length=100, min_length=5, max_plength=400, min_plength=5):
make_path(output_path)
make_path(paragraphs_path)
outfile = open(output_path, 'w', encoding='utf8')
outfile_p = open(paragraphs_path, 'w', encoding='utf8')
with codecs.open(input_path, "r", encoding='utf8') as infile:
source = json.load(infile)
pid = 0
sid = 0
for article in tqdm(source["data"]):
for para in article["paragraphs"]:
context = para["context"]
p = context
len_p = len(p.split())
if len_p >= max_plength or len_p <= min_plength:
continue
p = normalize_text(p)
outfile_p.write(str(pid) + "\t" + p.rstrip().replace("\n", "\\n") + "\n")
sentences = nltk.sent_tokenize(context)
for s in sentences:
len_s = len(s.split())
if len_s >= max_length or len_s <= min_length:
continue
s = normalize_text(s)
outfile.write(str(pid) + "\t" + str(sid) + "\t" + s.rstrip().replace("\n", "\\n") + "\n")
sid += 1
pid += 1
infile.close()
outfile.close()
outfile_p.close()
def file2sentences(input_path, data_type, output_path, paragraphs_path,
max_length=100, min_length=5, max_plength=400, min_plength=5):
if data_type.lower() == "wiki10000":
wiki2sentences(input_path, output_path, paragraphs_path, max_length, min_length, max_plength, min_plength)
elif data_type.lower() == "squad":
squad2sentences(input_path, output_path, paragraphs_path, max_length, min_length, max_plength, min_plength)
else:
print("The data_type must be wiki10000 or squad...")
def sentences2augmented_sentences(input_path, output_path, start_index, end_index, sample_probs,
num_sample_answer=5, num_sample_clue=2, num_sample_style=2,
max_sample_times=20):
augmented_sentences = []
with codecs.open(input_path, "r", encoding='utf8') as infile:
sentences = infile.readlines()
assert start_index < end_index
assert start_index < len(sentences)
assert end_index <= len(sentences)
print("Start augment data...")
for i in range(start_index, end_index):
print(i)
s_split = sentences[i].rstrip().split("\t")
pid = s_split[0]
sid = s_split[1]
s = s_split[2]
# augmented_s = augment_qg_data(s) # NOTICE: for FQG_data_augmentor_old
augmented_s = augment_qg_data(
s, sample_probs,
num_sample_answer, num_sample_clue, num_sample_style,
max_sample_times)
augmented_s["pid"] = pid
augmented_s["sid"] = sid
augmented_sentences.append(augmented_s)
save(output_path, augmented_sentences, "save augmented sentences...")
infile.close()
def main(args):
# prepro files
CURRENT_PATH = os.getcwd().split("/")
DATA_PATH = "/".join(CURRENT_PATH[:-4]) + "/Datasets/"
DATA_ACS_INFO_FILE_PATH = DATA_PATH + "processed/SQuAD1.1-Zhou/squad_ans_clue_style_info.pkl"
SAMPLE_PROBS_FILE_PATH = DATA_PATH + "processed/SQuAD1.1-Zhou/squad_sample_probs.pkl"
SQUAD_FILE = DATA_PATH + "original/SQuAD1.1-Zhou/train.txt"
# !!!NOTICE: remember to clear these files when needed, otherwise we won't re-calculate.
if not os.path.isfile(SAMPLE_PROBS_FILE_PATH) or args.not_processed_sample_probs_file:
print(SAMPLE_PROBS_FILE_PATH + " not exist.\nNow start generate these files.\n")
# if not exist, generate mapping dict and save to file
get_sample_probs(
filename=SQUAD_FILE, filetype="squad",
save_dataset_info_file=DATA_ACS_INFO_FILE_PATH, save_sample_probs_file=SAMPLE_PROBS_FILE_PATH,
sent_limit=100, ques_limit=50, answer_limit=30, is_clue_topN=20,
debug=args.debug, debug_length=20,
answer_length_bin_width=3, answer_length_min_val=0, answer_length_max_val=30,
clue_dep_dist_bin_width=2, clue_dep_dist_min_val=0, clue_dep_dist_max_val=20)
SAMPLE_PROBS = load(SAMPLE_PROBS_FILE_PATH)
print(SAMPLE_PROBS_FILE_PATH + " loaded.\n")
# excute tasks
if args.debug:
args.da_start_index = 0
args.da_end_index = 10
if args.da_task == "file2sentences":
file2sentences(
args.da_input_file,
args.da_input_type,
args.da_sentences_file,
args.da_paragraphs_file,
max_plength=args.para_limit,
max_length=args.sent_limit)
if args.da_task == "sentences2augmented_sentences":
sentences2augmented_sentences(
args.da_sentences_file,
args.da_augmented_sentences_file,
args.da_start_index,
args.da_end_index,
SAMPLE_PROBS,
args.num_sample_answer,
args.num_sample_clue,
args.num_sample_style,
args.max_sample_times)
if __name__ == '__main__':
main(parser.parse_args())