You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
35 lines
1.2 KiB
35 lines
1.2 KiB
2 years ago
|
// For licensing see accompanying LICENSE.md file.
|
||
|
// Copyright (C) 2022 Apple Inc. All Rights Reserved.
|
||
|
|
||
|
import Foundation
|
||
|
|
||
|
extension BPETokenizer {
|
||
|
enum FileReadError: Error {
|
||
|
case invalidMergeFileLine(Int)
|
||
|
}
|
||
|
|
||
|
/// Read vocab.json file at URL into a dictionary mapping a String to its Int token id
|
||
|
static func readVocabulary(url: URL) throws -> [String: Int] {
|
||
|
let content = try Data(contentsOf: url)
|
||
|
return try JSONDecoder().decode([String: Int].self, from: content)
|
||
|
}
|
||
|
|
||
|
/// Read merges.txt file at URL into a dictionary mapping bigrams to the line number/rank/priority
|
||
|
static func readMerges(url: URL) throws -> [TokenPair: Int] {
|
||
|
let content = try String(contentsOf: url)
|
||
|
let lines = content.split(separator: "\n")
|
||
|
|
||
|
let merges: [(TokenPair, Int)] = try lines.enumerated().compactMap { (index, line) in
|
||
|
if line.hasPrefix("#") {
|
||
|
return nil
|
||
|
}
|
||
|
let pair = line.split(separator: " ")
|
||
|
if pair.count != 2 {
|
||
|
throw FileReadError.invalidMergeFileLine(index+1)
|
||
|
}
|
||
|
return (TokenPair(String(pair[0]), String(pair[1])),index)
|
||
|
}
|
||
|
return [TokenPair : Int](uniqueKeysWithValues: merges)
|
||
|
}
|
||
|
}
|