You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

36 lines
1.3 KiB

// For licensing see accompanying LICENSE.md file.
// Copyright (C) 2022 Apple Inc. All Rights Reserved.
import Foundation
@available(iOS 16.2, macOS 13.1, *)
extension BPETokenizer {
    /// Errors thrown while parsing tokenizer resource files.
    enum FileReadError: Error {
        /// A line of the merges file did not consist of exactly two space-separated tokens.
        /// The associated value is the 1-based line number of the offending line.
        case invalidMergeFileLine(Int)
    }

    /// Read vocab.json file at URL into a dictionary mapping a String to its Int token id
    ///
    /// - Parameter url: Location of a JSON file whose top-level value is an object with integer values.
    /// - Returns: The decoded token-to-id dictionary.
    /// - Throws: Any error raised while reading the file, or by `JSONDecoder` when the
    ///   contents cannot be decoded as `[String: Int]`.
    static func readVocabulary(url: URL) throws -> [String: Int] {
        let content = try Data(contentsOf: url)
        return try JSONDecoder().decode([String: Int].self, from: content)
    }

    /// Read merges.txt file at URL into a dictionary mapping bigrams to the line number/rank/priority
    ///
    /// The file is decoded as UTF-8 and split into lines on any newline character,
    /// so LF and CRLF line endings are both accepted. Blank lines are ignored.
    /// Lines starting with `#` are treated as comments: they produce no entry but
    /// still occupy a rank, so the rank of a pair is the zero-based position of its
    /// line among the non-empty lines of the file.
    ///
    /// If the same pair appears on more than one line, the rank of its first
    /// occurrence is kept.
    ///
    /// - Parameter url: Location of the merges file.
    /// - Returns: A dictionary from each token pair to its rank.
    /// - Throws: Any error raised while reading the file, or
    ///   `FileReadError.invalidMergeFileLine` carrying the 1-based line number of a
    ///   non-comment line that does not contain exactly two space-separated tokens.
    static func readMerges(url: URL) throws -> [TokenPair: Int] {
        let content = try String(contentsOf: url, encoding: .utf8)

        // "\r\n" is a single Character in Swift and does not compare equal to "\n",
        // so splitting on "\n" alone would leave a CRLF file as one giant line.
        // Empty subsequences are kept here so that `lineIndex` below is the true
        // position of the line in the file.
        let lines = content.split(omittingEmptySubsequences: false, whereSeparator: \.isNewline)

        var merges: [TokenPair: Int] = [:]
        // Counts non-empty lines seen so far (comments included).
        var nextRank = 0
        for (lineIndex, line) in lines.enumerated() where !line.isEmpty {
            let rank = nextRank
            nextRank += 1

            if line.hasPrefix("#") {
                continue
            }
            let pair = line.split(separator: " ")
            guard pair.count == 2 else {
                throw FileReadError.invalidMergeFileLine(lineIndex + 1)
            }
            let key = TokenPair(String(pair[0]), String(pair[1]))
            // A repeated pair must not crash the load; keep the first rank recorded.
            if merges[key] == nil {
                merges[key] = rank
            }
        }
        return merges
    }
}