-
Notifications
You must be signed in to change notification settings - Fork 22
/
Copy pathconfig.properties
138 lines (128 loc) · 3.93 KB
/
config.properties
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
##### GENERAL CONFIGURATIONS
# location of elasticsearch
elasticsearchLoc=/Users/waynetsui/Desktop/GitHub/siamese/elasticsearch-2.2.0
# elasticsearch's server name (or IP)
server=localhost
# elasticsearch's cluster name. See cluster.name in your $elasticsearchLoc/config/elasticsearch.yml
cluster=stackoverflow
# index name
index=test
# type name
type=siamese
# location of the input folder. This is the location of the files to be indexed (if command=index),
# or the location of the queries (if command=search).
inputFolder=/Users/Chaiyong/Documents/phd/2016/cloplag/tests_andrea
# only for GitHub indexing, leave blank if not needed
subInputFolder=
# output folder to store the search results
outputFolder=search_results
# use DFS mode [true, false] (default: false)
dfs=true
writeToFile=true
# source code file extension
extension=java
# minimum clone size (lines)
minCloneSize=6
# command to execute [index,search]
command=index
# print out logging data
isPrint=false
# output format [csv = filename, csvfline = filename#start#end, gcf = general clone format]
outputFormat=csvfline
# indexing mode [sequential, bulk]
indexingMode=bulk
# size of bulk insert
bulkSize=4000
# clone granularity [method, file]
parseMode=method
# print the progress of indexing/querying every x files
printEvery=10000
# recreate the index if it exists [true, false]
recreateIndexIfExists=true
##### DELETE SETTINGS
deleteField=
deleteWildcard=
deleteAmount=1000
##### PARSER + TOKENIZER + NORMALIZER SETTINGS
methodParser=crest.siamese.language.java.JavaMethodParser
tokenizer=crest.siamese.language.java.JavaTokenizer
normalizer=crest.siamese.language.java.JavaNormalizer
##### MULTI-REPRESENTATION SETTINGS
multirep=true
enableRep=true,true,true,true
##### NORMALIZATION MODE
##### Code normalisation for the T2 and T3 representations.
### Java
# Combination of x (none), w (words), d (datatypes), j (Java classes), p (Java packages),
# k (keywords), v (values), s (strings), o (operators), e (escape).
# T2 norm mode: dsvw
# T3 norm mode: djkopsvw
### Python 3
# Combination of k (keywords), v (values), s (strings), o (operators), w (words)
# T2 norm mode: vsw
# T3 norm mode: kvsow
normalizerMode=crest.siamese.language.java.JavaNormalizerMode
t2NormMode=dsvw
t3NormMode=djkopsvw
# turn on ngram
isNgram=true
# size of ngram.
# representation T3
ngramSize=4
# representation T2
t2NgramSize=4
# representation T1
t1NgramSize=4
##### QUERY-RELATED SETTINGS
# starting result offset (usually zero)
resultOffset=0
# the size of the results
resultsSize=100
# tfidf, bm25, dfr, ib, lmd (LM Dirichlet), lmj (LM Jelinek-Mercer)
rankingFunction=tfidf
# QUERY REDUCTION SETTINGS
# turn on query reduction [true/false]
queryReduction=true
# reduction percentile for the T3 layer [0, 100]
QRPercentileNorm=10
# reduction percentile for the T2 layer [0, 100]
QRPercentileT2=10
# reduction percentile for the T1 layer [0, 100]
QRPercentileT1=10
# reduction percentile for the T0 (original) layer [0, 100]
QRPercentileOrig=10
# boosting for T3 layer
normBoost=4
# boosting for T2 layer
t2Boost=4
# boosting for T1 layer
t1Boost=4
# boosting for T0 layer
origBoost=1
# ignore the query clones
ignoreQueryClones=true
##### LICENSE EXTRACTION
# extract license [true, false]
license=false
# license extractor [ninka, regexp]
licenseExtractor=regexp
##### EXPERIMENT CONFIGURATIONS
# ONLY USED FOR THE EXPERIMENTS OF SIAMESE
# elasticsearch similarity function + ngram + normalisation [or both]
similarityMode=tfidf_text_both
# prefix of the clone cluster file name [cloplag/soco]
cloneClusterFile=soco
# IR error measure [arp/map]
errorMeasure=map
# delete the index after every run?
deleteIndexAfterUse=true
##### SIMILARITY
# compute similarity of the results [fuzzywuzzy, tokenratio, none]
computeSimilarity=tokenratio
# the similarity threshold for the four representations [T1,T2,T3,T4] respectively
simThreshold=50%,60%,70%,80%
# GitHub indexing? (automatically add URL)
github=false
##### DEPRECATED
# (DEPRECATED) no. of total documents in the index
totalDocuments=100