Skip to content

Commit

Permalink
Add the components governance file cgmanifest.json for tokenizer's …
Browse files Browse the repository at this point in the history
…vocab files (#7283)

* Add the governance file cgmanifest.json for tokenizer's vocab files

* Address the feedback

* apply more schema requirements on the doc
  • Loading branch information
tarekgh authored Nov 1, 2024
1 parent a9b4212 commit 7cce753
Show file tree
Hide file tree
Showing 6 changed files with 61 additions and 6 deletions.
4 changes: 2 additions & 2 deletions THIRD-PARTY-NOTICES.TXT
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

License notice for OpenAI Tiktoken Tokenizer
--------------------------------------------
License notice for OpenAI Tiktoken Tokenizer & Tokenizer's vocab files
----------------------------------------------------------------------

https://github.com/openai/tiktoken/blob/main/LICENSE

Expand Down
54 changes: 54 additions & 0 deletions cgmanifest.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
{
"$schema": "https://json.schemastore.org/component-detection-manifest.json",
"version": 1,
"registrations": [
{
"component": {
"type": "other",
"other": {
"name": "cl100k_base.tiktoken",
"version": "1",
"downloadUrl": "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken",
"hash": "sha1:6494e42d5aad2bbb441ea9793af9e7db335c8d9c"
}
},
"developmentDependency": false
},
{
"component": {
"type": "other",
"other": {
"name": "o200k_base.tiktoken",
"version": "1",
"downloadUrl": "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken",
"hash": "sha1:1d4fdeb17c52829ead47ac65e61197fd530b1c31"
}
},
"developmentDependency": false
},
{
"component": {
"type": "other",
"other": {
"name": "p50k_base.tiktoken",
"version": "1",
"downloadUrl": "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
"hash": "sha1:0ecf4ae6d454e7719bcf35f284eac0b73f37e3c9"
}
},
"developmentDependency": false
},
{
"component": {
"type": "other",
"other": {
"name": "r50k_base.tiktoken",
"version": "1",
"downloadUrl": "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken",
"hash": "sha1:5674ba48e48e76284eb747c896a291dc5583c808"
}
},
"developmentDependency": false
}
]
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,11 @@
<!--
The following files are compressed using DeflateStream and embedded as resources in the assembly.
The files are downloaded from the following sources and compressed to the Destination.
- gpt2.tiktoken: https://fossies.org/linux/misc/whisper-20231117.tar.gz/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b
- gpt2.tiktoken: https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken
The file is licensed under the MIT license: https://github.com/openai/tiktoken/blob/main/LICENSE
The Gpt2 vocab data is exactly the same as the r50k_base vocab data, just under a different name.
In the CompressFile task above we modify the file's content to eliminate the ranks, thus reducing the file size,
since the rank corresponds to the line number in the file. For the file p50k_base.tiktoken,
we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number.
Expand Down
2 changes: 1 addition & 1 deletion src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1153,7 +1153,7 @@ private static (Dictionary<string, int> SpecialTokens, Regex Regex, string Vocab
private const string Cl100kBaseVocabFile = "cl100k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
private const string P50RanksFile = "p50k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken"
private const string R50RanksFile = "r50k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken"
private const string GPT2File = "gpt2.tiktoken.deflate"; // "https://fossies.org/linux/misc/legacy/whisper-20231117.tar.gz:b/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b"
private const string GPT2File = "gpt2.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken". Gpt2 is using the same encoding as R50kBase
private const string O200kBaseFile = "o200k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken"

internal const string Cl100kBaseEncodingName = "cl100k_base";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ public void TestMissingDataPackages(string modelName, string packageName)

public static IEnumerable<object[]> ModelUrlData()
{
// Gpt2 is covered by the r50k_base.tiktoken file
yield return new object[] { @"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" };
yield return new object[] { @"https://fossies.org/linux/misc/legacy/whisper-20231117.tar.gz:b/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b" };
yield return new object[] { @"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken" };
yield return new object[] { @"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken" };
yield return new object[] { @"https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" };
Expand Down
2 changes: 1 addition & 1 deletion test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ public async Task TestTokenizerCreation()
public static IEnumerable<object[]> ModelUrlData()
{
yield return new object[] { GPT4, @"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" };
yield return new object[] { GPT2, @"https://fossies.org/linux/misc/legacy/whisper-20231117.tar.gz:b/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b" };
yield return new object[] { GPT2, @"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken" }; // GPT2 uses the same encoding as R50kBase
yield return new object[] { P50kBase, @"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken" };
yield return new object[] { R50kBase, @"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken" };
yield return new object[] { GPT4o, @"https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" };
Expand Down

0 comments on commit 7cce753

Please sign in to comment.