index.ts
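// Populates a local Weaviate instance with an example dataset (schema.json + data.json)
// and exports the stored vectors and their metadata as TSV files.
//
// Usage (a sketch, assuming the script is run with ts-node):
//   ts-node index.ts populate <example>   // (re)create the class and import the data
//   ts-node index.ts tsv <example>        // export vectors.tsv and metadata.tsv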
import weaviate, { WeaviateClient, ObjectsBatcher } from 'weaviate-ts-client';
import { TSV, Parser } from 'tsv';
import fs from 'fs';
const client: WeaviateClient = weaviate.client({
  scheme: 'http',
  host: 'localhost:8080',
});
async function deleteClass(className: string) {
  // Check whether the class exists before trying to delete it
  try {
    await client.schema.classGetter().withClassName(className).do();
  } catch (err) {
    console.error('Class not deleted, as it does not exist: ' + className);
    return;
  }
  try {
    await client.schema.classDeleter().withClassName(className).do();
    console.log('Class deleted: ' + className);
  } catch (err) {
    console.error('Error while deleting class ' + className, err);
  }
}
async function createClass(schemaObj: any) {
  try {
    await client.schema.classCreator().withClass(schemaObj).do();
  } catch (err) {
    console.error(err);
  }
}
async function populateData(example: string) {
  // Read the schema file and the class name it defines
  const schemaObj = JSON.parse(fs.readFileSync('./examples/' + example + '/schema.json', 'utf8'));
  const className = schemaObj.class;
  // We are repopulating all the data, so remove the existing class (if any) and recreate it first
  await deleteClass(className);
  await createClass(schemaObj);
  // Get the data from the data.json file
  const { data } = JSON.parse(fs.readFileSync('./examples/' + example + '/data.json', 'utf8'));
  // Prepare a batcher
  let batcher: ObjectsBatcher = client.batch.objectsBatcher();
  let counter: number = 0;
  const batchSize: number = 100;
  let batchNumber = 1;
  // Loop through the data
  for (const dataPoint of data) {
    // Build the object properties from the schema (property names must match the schema)
    const properties: any = {};
    schemaObj.properties.forEach((property: any) => {
      properties[property.name] = dataPoint[property.name];
    });
    // Create an object
    const obj = {
      class: className,
      properties: properties,
    };
    // Add the object to the batch queue
    batcher = batcher.withObject(obj);
    // When the batch counter reaches batchSize, push the objects to Weaviate
    if (++counter === batchSize) {
      // Flush the batch queue
      await batcher
        .do()
        .catch((err: Error) => {
          console.error(err);
        });
      console.log('Batch ' + batchNumber + '/' + Math.ceil(data.length / batchSize) + ' done. (' + data.length + ' objects in total)');
      batchNumber++;
      // Start a new batch queue
      counter = 0;
      batcher = client.batch.objectsBatcher();
    }
  }
  // Flush the remaining objects
  if (counter > 0) {
    await batcher
      .do()
      .catch((err: Error) => {
        console.error(err);
      });
  }
}
async function createTsvFiles(example: string) {
  const { data } = JSON.parse(fs.readFileSync('./examples/' + example + '/data.json', 'utf8'));
  const schema = JSON.parse(fs.readFileSync('./examples/' + example + '/schema.json', 'utf8'));
  const className = schema.class;
  // Build a space-separated string of all schema property names
  const schemaProperties = schema.properties.map((property: any) => {
    return property.name;
  }).join(' ');
  const allVectors = await getAllVectors(className, schemaProperties, data.length);
  if (!allVectors) {
    console.error('No vectors retrieved for class ' + className);
    return;
  }
  // Remove newlines from the data so each object stays on a single TSV row
  allVectors.forEach((dataPoint: any) => {
    Object.keys(dataPoint).forEach((key: string) => {
      if (typeof dataPoint[key] === 'string') {
        dataPoint[key] = dataPoint[key].replace(/(\r\n|\n|\r)/gm, ' ');
      }
    });
  });
  const mappedVectors = allVectors.map((vector: any) => {
    return vector._additional.vector;
  });
  fs.writeFileSync('./examples/' + example + '/mapped.json', JSON.stringify(mappedVectors));
  const makeTsv = JSON.parse(fs.readFileSync('./examples/' + example + '/mapped.json', 'utf8'));
  // Write the raw vectors (no header) and the object metadata as TSV files
  const vectors = new Parser("\t", { header: false }).stringify(makeTsv);
  fs.writeFileSync('./examples/' + example + '/vectors.tsv', vectors);
  allVectors.forEach((dataPoint: any) => {
    delete dataPoint._additional;
  });
  const metadata = TSV.stringify(allVectors);
  fs.writeFileSync('./examples/' + example + '/metadata.tsv', metadata);
}
async function getAllVectors(className: string, properties: string, limit: number) {
  // Fetch all objects of the class, including their vectors, via a GraphQL Get query
  const response = await client.graphql
    .get()
    .withClassName(className)
    .withFields(properties + ' _additional {vector}')
    .withLimit(limit)
    .do()
    .catch((err: Error) => {
      console.error(err);
    });
  if (response) {
    return response.data.Get[className];
  }
}
function main() {
  if (process.argv[2] === 'populate') {
    populateData(process.argv[3]);
  } else if (process.argv[2] === 'tsv') {
    createTsvFiles(process.argv[3]);
  }
}
main();