Skip to content

Commit 537d732

Browse files
authored
Merge pull request AzureCosmosDB#111 from hsavran/main
PostgreSQL and Generate vector for Mongo DB
2 parents d93bd18 + 3be9df4 commit 537d732

23 files changed

+774
-12
lines changed

.github/actions/build-with-plugins/action.yml

+15
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,21 @@ runs:
151151
-p:PublishReadyToRun=false \
152152
-p:PublishTrimmed=false \
153153
-p:Version=${{ inputs.build-version }}
154+
- name: Build PostgreSQL Extension
155+
shell: bash
156+
run: |
157+
dotnet publish \
158+
Extensions/PostgreSQL/Cosmos.DataTransfer.PostgresqlExtension.csproj \
159+
--configuration Release \
160+
--output ${{ inputs.platform-short }}/Extensions \
161+
--self-contained false \
162+
--runtime ${{ inputs.runtime }} \
163+
-p:PublishSingleFile=false \
164+
-p:DebugType=embedded \
165+
-p:EnableCompressionInSingleFile=true \
166+
-p:PublishReadyToRun=false \
167+
-p:PublishTrimmed=false \
168+
-p:Version=${{ inputs.build-version }}
154169
- name: Upload package
155170
uses: actions/upload-artifact@v3
156171
with:

Core/Cosmos.DataTransfer.Core/Cosmos.DataTransfer.Core.csproj

+3-3
Original file line numberDiff line numberDiff line change
@@ -19,15 +19,15 @@
1919
</PropertyGroup>
2020

2121
<ItemGroup>
22-
<PackageReference Include="Azure.Core" Version="1.31.0" />
23-
<PackageReference Include="Microsoft.Data.SqlClient" Version="5.0.0" />
22+
<PackageReference Include="Azure.Core" Version="1.36.0" />
23+
<PackageReference Include="Microsoft.Data.SqlClient" Version="5.2.0" />
2424
<PackageReference Include="Microsoft.Extensions.Configuration.UserSecrets" Version="6.0.1" />
2525
<PackageReference Include="Microsoft.Extensions.Hosting" Version="6.0.1" />
2626
<PackageReference Include="Microsoft.Extensions.Logging.Console" Version="6.0.0" />
2727
<PackageReference Include="System.CommandLine" Version="2.0.0-beta4.22272.1" />
2828
<PackageReference Include="System.CommandLine.Hosting" Version="0.4.0-alpha.22272.1" />
2929
<PackageReference Include="System.ComponentModel.Composition" Version="6.0.0" />
30-
<PackageReference Include="System.Configuration.ConfigurationManager" Version="6.0.0" />
30+
<PackageReference Include="System.Configuration.ConfigurationManager" Version="8.0.0" />
3131
</ItemGroup>
3232

3333
<ItemGroup>

Core/Cosmos.DataTransfer.Core/migrationsettings.json

+6-4
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
{
2-
"Source": null,
3-
"Sink": null,
4-
"SourceSettings": {
2+
"Source": "",
3+
"Sink": "",
4+
"SourceSettings": {
5+
56
},
6-
"SinkSettings": {
7+
"SinkSettings": {
8+
79
},
810
"Operations": [
911
//{

CosmosDbDataMigrationTool.sln

+23-2
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Mongo", "Mongo", "{F18E789A
3939
Extensions\Mongo\README.md = Extensions\Mongo\README.md
4040
EndProjectSection
4141
EndProject
42-
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Cosmos.DataTransfer.MongoExtension", "Extensions\Mongo\Cosmos.DataTransfer.MongoExtension\Cosmos.DataTransfer.MongoExtension.csproj", "{F6EAC33B-9F7D-433B-9328-622FB8938C24}"
42+
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Cosmos.DataTransfer.MongoVectorExtension", "Extensions\Mongo\Cosmos.DataTransfer.MongoVectorExtension\Cosmos.DataTransfer.MongoVectorExtension.csproj", "{F6EAC33B-9F7D-433B-9328-622FB8938C24}"
4343
EndProject
4444
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Cosmos.DataTransfer.JsonExtension.UnitTests", "Extensions\Json\Cosmos.DataTransfer.JsonExtension.UnitTests\Cosmos.DataTransfer.JsonExtension.UnitTests.csproj", "{ED1E375E-A5A3-47EA-A7D5-07344C7E152F}"
4545
EndProject
@@ -87,14 +87,24 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Csv", "Csv", "{39930280-DA2
8787
EndProject
8888
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Cosmos.DataTransfer.CsvExtension", "Extensions\Csv\Cosmos.DataTransfer.CsvExtension\Cosmos.DataTransfer.CsvExtension.csproj", "{6A3FB90C-B837-4724-A406-214D4CEA686F}"
8989
EndProject
90-
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Cosmos.DataTransfer.CsvExtension.UnitTests", "Extensions\Csv\Cosmos.DataTransfer.CsvExtension.UnitTests\Cosmos.DataTransfer.CsvExtension.UnitTests.csproj", "{40AD8890-BD78-48F5-AE76-2C2FC6F15B7E}"
90+
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Cosmos.DataTransfer.CsvExtension.UnitTests", "Extensions\Csv\Cosmos.DataTransfer.CsvExtension.UnitTests\Cosmos.DataTransfer.CsvExtension.UnitTests.csproj", "{40AD8890-BD78-48F5-AE76-2C2FC6F15B7E}"
91+
EndProject
9192
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{BCBBAF22-0CB5-416B-8C80-03AB2FC4D0A0}"
9293
ProjectSection(SolutionItems) = preProject
9394
Contributing.md = Contributing.md
9495
ExampleConfigs.md = ExampleConfigs.md
9596
README.md = README.md
9697
EndProjectSection
9798
EndProject
99+
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Cosmos.DataTransfer.PostgresqlExtension", "Extensions\PostgreSQL\Cosmos.DataTransfer.PostgresqlExtension.csproj", "{85820167-DB94-458B-B09B-9E823996C692}"
100+
EndProject
101+
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "PostgreSQL", "PostgreSQL", "{1B927C5F-50FC-42A6-BAF6-B00E6D760543}"
102+
ProjectSection(SolutionItems) = preProject
103+
Extensions\PostgreSQL\README.md = Extensions\PostgreSQL\README.md
104+
EndProjectSection
105+
EndProject
106+
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Cosmos.DataTransfer.MongoExtension", "Extensions\Mongo\Cosmos.DataTransfer.MongoExtension\Cosmos.DataTransfer.MongoExtension.csproj", "{31BC84E1-55E5-45AA-BFAC-90732F20588B}"
107+
EndProject
98108
Global
99109
GlobalSection(SolutionConfigurationPlatforms) = preSolution
100110
Debug|Any CPU = Debug|Any CPU
@@ -181,6 +191,14 @@ Global
181191
{40AD8890-BD78-48F5-AE76-2C2FC6F15B7E}.Debug|Any CPU.Build.0 = Debug|Any CPU
182192
{40AD8890-BD78-48F5-AE76-2C2FC6F15B7E}.Release|Any CPU.ActiveCfg = Release|Any CPU
183193
{40AD8890-BD78-48F5-AE76-2C2FC6F15B7E}.Release|Any CPU.Build.0 = Release|Any CPU
194+
{85820167-DB94-458B-B09B-9E823996C692}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
195+
{85820167-DB94-458B-B09B-9E823996C692}.Debug|Any CPU.Build.0 = Debug|Any CPU
196+
{85820167-DB94-458B-B09B-9E823996C692}.Release|Any CPU.ActiveCfg = Release|Any CPU
197+
{85820167-DB94-458B-B09B-9E823996C692}.Release|Any CPU.Build.0 = Release|Any CPU
198+
{31BC84E1-55E5-45AA-BFAC-90732F20588B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
199+
{31BC84E1-55E5-45AA-BFAC-90732F20588B}.Debug|Any CPU.Build.0 = Debug|Any CPU
200+
{31BC84E1-55E5-45AA-BFAC-90732F20588B}.Release|Any CPU.ActiveCfg = Release|Any CPU
201+
{31BC84E1-55E5-45AA-BFAC-90732F20588B}.Release|Any CPU.Build.0 = Release|Any CPU
184202
EndGlobalSection
185203
GlobalSection(SolutionProperties) = preSolution
186204
HideSolutionNode = FALSE
@@ -212,6 +230,9 @@ Global
212230
{39930280-DA29-4814-837B-FA7F252EB3EC} = {A8A1CEAB-2D82-460C-9B86-74ABD17CD201}
213231
{6A3FB90C-B837-4724-A406-214D4CEA686F} = {39930280-DA29-4814-837B-FA7F252EB3EC}
214232
{40AD8890-BD78-48F5-AE76-2C2FC6F15B7E} = {39930280-DA29-4814-837B-FA7F252EB3EC}
233+
{85820167-DB94-458B-B09B-9E823996C692} = {1B927C5F-50FC-42A6-BAF6-B00E6D760543}
234+
{1B927C5F-50FC-42A6-BAF6-B00E6D760543} = {A8A1CEAB-2D82-460C-9B86-74ABD17CD201}
235+
{31BC84E1-55E5-45AA-BFAC-90732F20588B} = {F18E789A-D32D-48D3-B75F-1196D7215F74}
215236
EndGlobalSection
216237
GlobalSection(ExtensibilityGlobals) = postSolution
217238
SolutionGuid = {662B3F27-70D8-45E6-A1C0-1438A9C8A542}

Extensions/Cosmos/Cosmos.DataTransfer.CosmosExtension/Cosmos.DataTransfer.CosmosExtension.csproj

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
</PropertyGroup>
99

1010
<ItemGroup>
11-
<PackageReference Include="Azure.Identity" Version="1.6.0" />
11+
<PackageReference Include="Azure.Identity" Version="1.10.3" />
1212
<PackageReference Include="Microsoft.Azure.Cosmos" Version="3.34.0" />
1313
<PackageReference Include="Microsoft.Extensions.Configuration" Version="6.0.1" />
1414
<PackageReference Include="Microsoft.Extensions.Configuration.Abstractions" Version="6.0.0" />
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
<Project Sdk="Microsoft.NET.Sdk">
2+
3+
<PropertyGroup>
4+
<TargetFramework>net6.0</TargetFramework>
5+
<ImplicitUsings>enable</ImplicitUsings>
6+
<Nullable>enable</Nullable>
7+
<OutputType>Exe</OutputType>
8+
</PropertyGroup>
9+
10+
<ItemGroup>
11+
<PackageReference Include="Azure.AI.OpenAI" Version="1.0.0-beta.12" />
12+
<PackageReference Include="Microsoft.Extensions.Configuration.Binder" Version="6.0.0" />
13+
<PackageReference Include="MongoDB.Driver" Version="2.19.1" />
14+
<PackageReference Include="System.ComponentModel.Composition" Version="6.0.0" />
15+
</ItemGroup>
16+
17+
<ItemGroup>
18+
<ProjectReference Include="..\..\..\Interfaces\Cosmos.DataTransfer.Interfaces\Cosmos.DataTransfer.Interfaces.csproj" />
19+
<ProjectReference Include="..\Cosmos.DataTransfer.MongoExtension\Cosmos.DataTransfer.MongoExtension.csproj" />
20+
</ItemGroup>
21+
22+
<Target Name="PublishToExtensionsFolder" AfterTargets="Build" Condition=" '$(Configuration)' == 'Debug' ">
23+
<Exec Command="dotnet publish --configuration $(Configuration) --no-build -p:PublishProfile=PublishToExtensionsFolder" />
24+
</Target>
25+
26+
</Project>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
using System.ComponentModel.Composition;
2+
using Azure;
3+
using Azure.AI.OpenAI;
4+
using Cosmos.DataTransfer.Interfaces;
5+
using Cosmos.DataTransfer.MongoExtension;
6+
using Cosmos.DataTransfer.MongoVectorExtension.Settings;
7+
using Microsoft.Extensions.Configuration;
8+
using Microsoft.Extensions.Logging;
9+
using MongoDB.Bson;
10+
11+
namespace Cosmos.DataTransfer.MongoVectorExtension;
12+
[Export(typeof(IDataSinkExtension))]
13+
public class MongoVectorDataSinkExtension : IDataSinkExtensionWithSettings
14+
{
15+
public string DisplayName => $"MongoDB-Vector{ExtensionExtensions.BetaExtensionTag}";
16+
17+
public async Task WriteAsync(IAsyncEnumerable<IDataItem> dataItems, IConfiguration config, IDataSourceExtension dataSource, ILogger logger, CancellationToken cancellationToken = default)
18+
{
19+
var settings = config.Get<MongoVectorSinkSettings>();
20+
settings.Validate();
21+
22+
if (!string.IsNullOrEmpty(settings.ConnectionString) && !string.IsNullOrEmpty(settings.DatabaseName) && !string.IsNullOrEmpty(settings.Collection))
23+
{
24+
var Isembeddingsetsvalid = false;
25+
var client = new OpenAIClient("");
26+
if (settings.GenerateEmbedding.HasValue && settings.GenerateEmbedding.Value && settings.SourcePropEmbedding != null && settings.DestPropEmbedding != null)
27+
{
28+
if (!string.IsNullOrEmpty(settings.OpenAIUrl) && !string.IsNullOrEmpty(settings.OpenAIKey) && !string.IsNullOrEmpty(settings.OpenAIDeploymentName))
29+
{
30+
client = new OpenAIClient(new Uri(settings.OpenAIUrl), new AzureKeyCredential(settings.OpenAIKey));
31+
Isembeddingsetsvalid = true;
32+
logger.LogInformation("OpenAI Embedding settings are valid.");
33+
}
34+
}
35+
36+
var context = new Context(settings.ConnectionString, settings.DatabaseName);
37+
var repo = context.GetRepository<BsonDocument>(settings.Collection);
38+
var batchSize = settings.BatchSize ?? 1000;
39+
var objects = new List<BsonDocument>();
40+
int itemCount = 0;
41+
await foreach (var item in dataItems.WithCancellation(cancellationToken))
42+
{
43+
var dict = item.BuildDynamicObjectTree();
44+
45+
if (Isembeddingsetsvalid)
46+
{
47+
var valtoemb = item.GetValue(settings.SourcePropEmbedding)?.ToString();
48+
if (!string.IsNullOrEmpty(valtoemb) && valtoemb?.Length < 8192)
49+
{
50+
var options = new EmbeddingsOptions()
51+
{
52+
DeploymentName = settings.OpenAIDeploymentName,
53+
Input = { valtoemb }
54+
};
55+
var vector = await client.GetEmbeddingsAsync(options,cancellationToken);
56+
if (vector != null)
57+
{
58+
dict?.TryAdd(settings.DestPropEmbedding, vector.Value.Data[0].Embedding.ToArray());
59+
}
60+
}
61+
}
62+
objects.Add(new BsonDocument(dict));
63+
itemCount++;
64+
65+
if (objects.Count == batchSize)
66+
{
67+
await repo.AddRange(objects);
68+
logger.LogInformation("Added {ItemCount} items to collection '{Collection}'", itemCount, settings.Collection);
69+
objects.Clear();
70+
}
71+
}
72+
73+
if (objects.Any())
74+
{
75+
await repo.AddRange(objects);
76+
}
77+
78+
if (itemCount > 0)
79+
logger.LogInformation("Added {ItemCount} total items to collection '{Collection}'", itemCount, settings.Collection);
80+
else
81+
logger.LogWarning("No items added to collection '{Collection}'", settings.Collection);
82+
}
83+
}
84+
85+
public IEnumerable<IDataExtensionSettings> GetSettings()
86+
{
87+
yield return new MongoVectorSinkSettings();
88+
}
89+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Console.WriteLine("Starting Mongo extension");
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
<?xml version="1.0" encoding="utf-8"?>
2+
<!--
3+
https://go.microsoft.com/fwlink/?LinkID=208121.
4+
-->
5+
<Project>
6+
<PropertyGroup Condition=" '$(Configuration)' == 'Debug' ">
7+
<Configuration>Debug</Configuration>
8+
<Platform>Any CPU</Platform>
9+
<PublishDir>..\..\..\Core\Cosmos.DataTransfer.Core\bin\Debug\net6.0\Extensions</PublishDir>
10+
<PublishProtocol>FileSystem</PublishProtocol>
11+
<_TargetId>Folder</_TargetId>
12+
<TargetFramework>net6.0</TargetFramework>
13+
<SelfContained>false</SelfContained>
14+
</PropertyGroup>
15+
<PropertyGroup Condition=" '$(Configuration)' != 'Debug' ">
16+
<Configuration>Release</Configuration>
17+
<Platform>Any CPU</Platform>
18+
<PublishDir>..\..\..\Core\Cosmos.DataTransfer.Core\bin\Release\net6.0\Extensions</PublishDir>
19+
<PublishProtocol>FileSystem</PublishProtocol>
20+
<_TargetId>Folder</_TargetId>
21+
<TargetFramework>net6.0</TargetFramework>
22+
<SelfContained>false</SelfContained>
23+
</PropertyGroup>
24+
</Project>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
using System.ComponentModel.DataAnnotations;
2+
using Cosmos.DataTransfer.MongoExtension.Settings;
3+
4+
namespace Cosmos.DataTransfer.MongoVectorExtension.Settings;
5+
public class MongoVectorSinkSettings : MongoBaseSettings
6+
{
7+
[Required]
8+
public string? Collection { get; set; }
9+
10+
public int? BatchSize { get; set; }
11+
12+
public bool? GenerateEmbedding { get; set; }
13+
14+
public string? OpenAIUrl { get; set; }
15+
public string? OpenAIKey { get; set; }
16+
17+
// name of the deployment for text-embedding-ada-002
18+
public string? OpenAIDeploymentName { get; set; }
19+
public string? SourcePropEmbedding { get; set; }
20+
public string? DestPropEmbedding { get; set; }
21+
}

Extensions/Mongo/README.md

+37-1
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,47 @@ Source and sink settings require both `ConnectionString` and `DatabaseName` para
2020

2121
### Sink
2222

23+
```json
24+
{
25+
"ConnectionString": "",
26+
"DatabaseName": "",
27+
"Collection": ""
28+
}
29+
```
30+
31+
# MongoDB Vector Extension (Beta)
32+
33+
The MongoDB Vector extension is a Sink only extension that builds on the MongoDB extension by providing additional capabilities for generating embeddings using Azure OpenAI APIs.
34+
35+
> **Note**: When specifying the MongoDB Vector extension as the Sink property in configuration, utilize the name **MongoDB-Vector(beta)**.
36+
37+
## Settings
38+
39+
The settings are based on the MongoDB extension settings with additional parameters for generating embeddings.
40+
41+
### Additional Sink Settings
42+
43+
The sink settings require the following additional parameters:
44+
45+
- `GenerateEmbedding`: If set to true, the sink will generate embeddings for the records before writing them to the database. Embeddings are only generated when `OpenAIUrl`, `OpenAIKey`, `OpenAIDeploymentName`, `SourcePropEmbedding`, and `DestPropEmbedding` are all set. The following parameters are required when this is true:
46+
- `OpenAIUrl`: The URL of the OpenAI API
47+
- `OpenAIKey`: The API key for the OpenAI API
48+
- `OpenAIDeploymentName`: The name of the Azure OpenAI deployment (for example, a `text-embedding-ada-002` deployment) used to generate the embeddings
49+
- `SourcePropEmbedding`: The property in the source data that should be used to generate the embeddings
50+
- `DestPropEmbedding`: New property name that will be added to the source data with the generated embeddings
51+
2352
```json
2453
{
2554
"ConnectionString": "",
2655
"DatabaseName": "",
2756
"Collection": "",
28-
"BatchSize: 100
57+
"BatchSize": 100,
58+
"GenerateEmbedding": true | false,
59+
"OpenAIUrl": "",
60+
"OpenAIKey": "",
61+
"OpenAIDeploymentName": "",
62+
"SourcePropEmbedding": "",
63+
"DestPropEmbedding": ""
2964
}
3065
```
66+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
<Project Sdk="Microsoft.NET.Sdk">
2+
3+
<PropertyGroup>
4+
<OutputType>Exe</OutputType>
5+
<TargetFramework>net6.0</TargetFramework>
6+
<ImplicitUsings>enable</ImplicitUsings>
7+
<Nullable>enable</Nullable>
8+
</PropertyGroup>
9+
10+
<ItemGroup>
11+
<PackageReference Include="Microsoft.Extensions.Configuration.Binder" Version="6.0.0" />
12+
<PackageReference Include="Npgsql" Version="7.0.6" />
13+
<PackageReference Include="System.ComponentModel.Composition" Version="7.0.0" />
14+
</ItemGroup>
15+
16+
<ItemGroup>
17+
<ProjectReference Include="..\..\Interfaces\Cosmos.DataTransfer.Interfaces\Cosmos.DataTransfer.Interfaces.csproj" />
18+
</ItemGroup>
19+
<Target Name="PublishToExtensionsFolder" AfterTargets="Build" Condition=" '$(Configuration)' == 'Debug' ">
20+
<Exec Command="dotnet publish --configuration $(Configuration) --no-build -p:PublishProfile=FolderProfile" />
21+
</Target>
22+
</Project>

0 commit comments

Comments
 (0)