From 0530db8d7fb8f3e81a634daa2c5c67c49a6c78fe Mon Sep 17 00:00:00 2001 From: Matthew Nelson Date: Fri, 9 Jan 2026 07:49:52 -0500 Subject: [PATCH] Add UTF8.CharPreProcessor extension functions for partial text calculations --- library/utf8/api/utf8.api | 12 ++ library/utf8/api/utf8.klib.api | 6 + .../io/matthewnelson/encoding/utf8/UTF8.kt | 176 ++++++++++++++++++ 3 files changed, 194 insertions(+) diff --git a/library/utf8/api/utf8.api b/library/utf8/api/utf8.api index 1c588d3a..ee52e942 100644 --- a/library/utf8/api/utf8.api +++ b/library/utf8/api/utf8.api @@ -30,14 +30,20 @@ public class io/matthewnelson/encoding/utf8/UTF8$CharPreProcessor { protected fun replacementSize ()I protected final fun setCurrentSize (J)V public static final fun sizeOf (Ljava/lang/CharSequence;Lio/matthewnelson/encoding/utf8/UTF8$Config;)J + public static final fun sizeOf (Ljava/lang/CharSequence;Lio/matthewnelson/encoding/utf8/UTF8$Config;II)J public static final fun sizeOf (Ljava/lang/CharSequence;Lio/matthewnelson/encoding/utf8/UTF8$ReplacementStrategy;)J + public static final fun sizeOf (Ljava/lang/CharSequence;Lio/matthewnelson/encoding/utf8/UTF8$ReplacementStrategy;II)J public static final fun sizeOf (Ljava/lang/CharSequence;Lio/matthewnelson/encoding/utf8/UTF8;)J + public static final fun sizeOf (Ljava/lang/CharSequence;Lio/matthewnelson/encoding/utf8/UTF8;II)J public static final fun sizeOf (Lkotlin/collections/CharIterator;Lio/matthewnelson/encoding/utf8/UTF8$Config;)J public static final fun sizeOf (Lkotlin/collections/CharIterator;Lio/matthewnelson/encoding/utf8/UTF8$ReplacementStrategy;)J public static final fun sizeOf (Lkotlin/collections/CharIterator;Lio/matthewnelson/encoding/utf8/UTF8;)J public static final fun sizeOf ([CLio/matthewnelson/encoding/utf8/UTF8$Config;)J + public static final fun sizeOf ([CLio/matthewnelson/encoding/utf8/UTF8$Config;II)J public static final fun sizeOf ([CLio/matthewnelson/encoding/utf8/UTF8$ReplacementStrategy;)J + public static final fun 
sizeOf ([CLio/matthewnelson/encoding/utf8/UTF8$ReplacementStrategy;II)J public static final fun sizeOf ([CLio/matthewnelson/encoding/utf8/UTF8;)J + public static final fun sizeOf ([CLio/matthewnelson/encoding/utf8/UTF8;II)J } public final class io/matthewnelson/encoding/utf8/UTF8$CharPreProcessor$Companion { @@ -45,14 +51,20 @@ public final class io/matthewnelson/encoding/utf8/UTF8$CharPreProcessor$Companio public final fun of (Lio/matthewnelson/encoding/utf8/UTF8$ReplacementStrategy;)Lio/matthewnelson/encoding/utf8/UTF8$CharPreProcessor; public final fun of (Lio/matthewnelson/encoding/utf8/UTF8;)Lio/matthewnelson/encoding/utf8/UTF8$CharPreProcessor; public final fun sizeOf (Ljava/lang/CharSequence;Lio/matthewnelson/encoding/utf8/UTF8$Config;)J + public final fun sizeOf (Ljava/lang/CharSequence;Lio/matthewnelson/encoding/utf8/UTF8$Config;II)J public final fun sizeOf (Ljava/lang/CharSequence;Lio/matthewnelson/encoding/utf8/UTF8$ReplacementStrategy;)J + public final fun sizeOf (Ljava/lang/CharSequence;Lio/matthewnelson/encoding/utf8/UTF8$ReplacementStrategy;II)J public final fun sizeOf (Ljava/lang/CharSequence;Lio/matthewnelson/encoding/utf8/UTF8;)J + public final fun sizeOf (Ljava/lang/CharSequence;Lio/matthewnelson/encoding/utf8/UTF8;II)J public final fun sizeOf (Lkotlin/collections/CharIterator;Lio/matthewnelson/encoding/utf8/UTF8$Config;)J public final fun sizeOf (Lkotlin/collections/CharIterator;Lio/matthewnelson/encoding/utf8/UTF8$ReplacementStrategy;)J public final fun sizeOf (Lkotlin/collections/CharIterator;Lio/matthewnelson/encoding/utf8/UTF8;)J public final fun sizeOf ([CLio/matthewnelson/encoding/utf8/UTF8$Config;)J + public final fun sizeOf ([CLio/matthewnelson/encoding/utf8/UTF8$Config;II)J public final fun sizeOf ([CLio/matthewnelson/encoding/utf8/UTF8$ReplacementStrategy;)J + public final fun sizeOf ([CLio/matthewnelson/encoding/utf8/UTF8$ReplacementStrategy;II)J public final fun sizeOf ([CLio/matthewnelson/encoding/utf8/UTF8;)J + public final fun 
sizeOf ([CLio/matthewnelson/encoding/utf8/UTF8;II)J } public final class io/matthewnelson/encoding/utf8/UTF8$Config : io/matthewnelson/encoding/core/EncoderDecoder$Config { diff --git a/library/utf8/api/utf8.klib.api b/library/utf8/api/utf8.klib.api index 02a1964c..42c2a6bf 100644 --- a/library/utf8/api/utf8.klib.api +++ b/library/utf8/api/utf8.klib.api @@ -61,14 +61,20 @@ open class io.matthewnelson.encoding.utf8/UTF8 : io.matthewnelson.encoding.core/ final object Companion { // io.matthewnelson.encoding.utf8/UTF8.CharPreProcessor.Companion|null[0] final fun (kotlin.collections/CharIterator).sizeUTF8(io.matthewnelson.encoding.utf8/UTF8.ReplacementStrategy): kotlin/Long // io.matthewnelson.encoding.utf8/UTF8.CharPreProcessor.Companion.sizeUTF8|sizeUTF8@kotlin.collections.CharIterator(io.matthewnelson.encoding.utf8.UTF8.ReplacementStrategy){}[0] final fun (kotlin/CharArray).sizeUTF8(io.matthewnelson.encoding.utf8/UTF8.ReplacementStrategy): kotlin/Long // io.matthewnelson.encoding.utf8/UTF8.CharPreProcessor.Companion.sizeUTF8|sizeUTF8@kotlin.CharArray(io.matthewnelson.encoding.utf8.UTF8.ReplacementStrategy){}[0] + final fun (kotlin/CharArray).sizeUTF8(io.matthewnelson.encoding.utf8/UTF8.ReplacementStrategy, kotlin/Int, kotlin/Int): kotlin/Long // io.matthewnelson.encoding.utf8/UTF8.CharPreProcessor.Companion.sizeUTF8|sizeUTF8@kotlin.CharArray(io.matthewnelson.encoding.utf8.UTF8.ReplacementStrategy;kotlin.Int;kotlin.Int){}[0] final fun (kotlin/CharSequence).sizeUTF8(io.matthewnelson.encoding.utf8/UTF8.ReplacementStrategy): kotlin/Long // io.matthewnelson.encoding.utf8/UTF8.CharPreProcessor.Companion.sizeUTF8|sizeUTF8@kotlin.CharSequence(io.matthewnelson.encoding.utf8.UTF8.ReplacementStrategy){}[0] + final fun (kotlin/CharSequence).sizeUTF8(io.matthewnelson.encoding.utf8/UTF8.ReplacementStrategy, kotlin/Int, kotlin/Int): kotlin/Long // 
io.matthewnelson.encoding.utf8/UTF8.CharPreProcessor.Companion.sizeUTF8|sizeUTF8@kotlin.CharSequence(io.matthewnelson.encoding.utf8.UTF8.ReplacementStrategy;kotlin.Int;kotlin.Int){}[0] final fun of(io.matthewnelson.encoding.utf8/UTF8.ReplacementStrategy): io.matthewnelson.encoding.utf8/UTF8.CharPreProcessor // io.matthewnelson.encoding.utf8/UTF8.CharPreProcessor.Companion.of|of(io.matthewnelson.encoding.utf8.UTF8.ReplacementStrategy){}[0] final inline fun (kotlin.collections/CharIterator).sizeUTF8(io.matthewnelson.encoding.utf8/UTF8): kotlin/Long // io.matthewnelson.encoding.utf8/UTF8.CharPreProcessor.Companion.sizeUTF8|sizeUTF8@kotlin.collections.CharIterator(io.matthewnelson.encoding.utf8.UTF8){}[0] final inline fun (kotlin.collections/CharIterator).sizeUTF8(io.matthewnelson.encoding.utf8/UTF8.Config): kotlin/Long // io.matthewnelson.encoding.utf8/UTF8.CharPreProcessor.Companion.sizeUTF8|sizeUTF8@kotlin.collections.CharIterator(io.matthewnelson.encoding.utf8.UTF8.Config){}[0] final inline fun (kotlin/CharArray).sizeUTF8(io.matthewnelson.encoding.utf8/UTF8): kotlin/Long // io.matthewnelson.encoding.utf8/UTF8.CharPreProcessor.Companion.sizeUTF8|sizeUTF8@kotlin.CharArray(io.matthewnelson.encoding.utf8.UTF8){}[0] + final inline fun (kotlin/CharArray).sizeUTF8(io.matthewnelson.encoding.utf8/UTF8, kotlin/Int, kotlin/Int): kotlin/Long // io.matthewnelson.encoding.utf8/UTF8.CharPreProcessor.Companion.sizeUTF8|sizeUTF8@kotlin.CharArray(io.matthewnelson.encoding.utf8.UTF8;kotlin.Int;kotlin.Int){}[0] final inline fun (kotlin/CharArray).sizeUTF8(io.matthewnelson.encoding.utf8/UTF8.Config): kotlin/Long // io.matthewnelson.encoding.utf8/UTF8.CharPreProcessor.Companion.sizeUTF8|sizeUTF8@kotlin.CharArray(io.matthewnelson.encoding.utf8.UTF8.Config){}[0] + final inline fun (kotlin/CharArray).sizeUTF8(io.matthewnelson.encoding.utf8/UTF8.Config, kotlin/Int, kotlin/Int): kotlin/Long // 
io.matthewnelson.encoding.utf8/UTF8.CharPreProcessor.Companion.sizeUTF8|sizeUTF8@kotlin.CharArray(io.matthewnelson.encoding.utf8.UTF8.Config;kotlin.Int;kotlin.Int){}[0] final inline fun (kotlin/CharSequence).sizeUTF8(io.matthewnelson.encoding.utf8/UTF8): kotlin/Long // io.matthewnelson.encoding.utf8/UTF8.CharPreProcessor.Companion.sizeUTF8|sizeUTF8@kotlin.CharSequence(io.matthewnelson.encoding.utf8.UTF8){}[0] + final inline fun (kotlin/CharSequence).sizeUTF8(io.matthewnelson.encoding.utf8/UTF8, kotlin/Int, kotlin/Int): kotlin/Long // io.matthewnelson.encoding.utf8/UTF8.CharPreProcessor.Companion.sizeUTF8|sizeUTF8@kotlin.CharSequence(io.matthewnelson.encoding.utf8.UTF8;kotlin.Int;kotlin.Int){}[0] final inline fun (kotlin/CharSequence).sizeUTF8(io.matthewnelson.encoding.utf8/UTF8.Config): kotlin/Long // io.matthewnelson.encoding.utf8/UTF8.CharPreProcessor.Companion.sizeUTF8|sizeUTF8@kotlin.CharSequence(io.matthewnelson.encoding.utf8.UTF8.Config){}[0] + final inline fun (kotlin/CharSequence).sizeUTF8(io.matthewnelson.encoding.utf8/UTF8.Config, kotlin/Int, kotlin/Int): kotlin/Long // io.matthewnelson.encoding.utf8/UTF8.CharPreProcessor.Companion.sizeUTF8|sizeUTF8@kotlin.CharSequence(io.matthewnelson.encoding.utf8.UTF8.Config;kotlin.Int;kotlin.Int){}[0] final inline fun of(io.matthewnelson.encoding.utf8/UTF8): io.matthewnelson.encoding.utf8/UTF8.CharPreProcessor // io.matthewnelson.encoding.utf8/UTF8.CharPreProcessor.Companion.of|of(io.matthewnelson.encoding.utf8.UTF8){}[0] final inline fun of(io.matthewnelson.encoding.utf8/UTF8.Config): io.matthewnelson.encoding.utf8/UTF8.CharPreProcessor // io.matthewnelson.encoding.utf8/UTF8.CharPreProcessor.Companion.of|of(io.matthewnelson.encoding.utf8.UTF8.Config){}[0] } diff --git a/library/utf8/src/commonMain/kotlin/io/matthewnelson/encoding/utf8/UTF8.kt b/library/utf8/src/commonMain/kotlin/io/matthewnelson/encoding/utf8/UTF8.kt index 30444846..430206cf 100644 --- 
a/library/utf8/src/commonMain/kotlin/io/matthewnelson/encoding/utf8/UTF8.kt +++ b/library/utf8/src/commonMain/kotlin/io/matthewnelson/encoding/utf8/UTF8.kt @@ -295,6 +295,10 @@ public open class UTF8: EncoderDecoder { * A helper for calculating the exact output byte-size of a text to UTF-8 byte transformation. * */ public open class CharPreProcessor private constructor( + + /** + * The strategy to use when encountering invalid character sequences. + * */ @JvmField public val strategy: ReplacementStrategy, ) { @@ -334,6 +338,10 @@ public open class UTF8: EncoderDecoder { * Calculate the UTF-8 byte output size for the provided array and [ReplacementStrategy] for the * [UTF8] encoder/decoder. * + * @param [utf8] The encoder/decoder to retrieve the [ReplacementStrategy] from. + * + * @return The exact number of UTF-8 bytes this transformation would result in. + * * @throws [MalformedEncodingException] If an invalid character sequence is encountered and the * [strategy] is [ReplacementStrategy.THROW]. * */ @@ -345,6 +353,10 @@ public open class UTF8: EncoderDecoder { * Calculate the UTF-8 byte output size for the provided array and [ReplacementStrategy] for the * [UTF8.Config]. * + * @param [config] The [Config] to retrieve the [ReplacementStrategy] from. + * + * @return The exact number of UTF-8 bytes this transformation would result in. + * * @throws [MalformedEncodingException] If an invalid character sequence is encountered and the * [strategy] is [ReplacementStrategy.THROW]. * */ @@ -355,6 +367,10 @@ public open class UTF8: EncoderDecoder { /** * Calculate the UTF-8 byte output size for the provided array and [ReplacementStrategy]. * + * @param [strategy] The [ReplacementStrategy]. + * + * @return The exact number of UTF-8 bytes this transformation would result in. + * * @throws [MalformedEncodingException] If an invalid character sequence is encountered and the * [strategy] is [ReplacementStrategy.THROW]. 
* */ @@ -366,10 +382,82 @@ public open class UTF8: EncoderDecoder { return cpp.doFinal() } + /** + * Calculate the UTF-8 byte output size for [len] number of characters, starting at index [offset], + * from the provided array, and [ReplacementStrategy] for the [UTF8] encoder/decoder. + * + * @param [utf8] The encoder/decoder to retrieve the [ReplacementStrategy] from. + * @param [offset] The index in the array to start at. + * @param [len] The number of characters, starting at index [offset]. + * + * @return The exact number of UTF-8 bytes this transformation would result in. + * + * @throws [MalformedEncodingException] If an invalid character sequence is encountered and the + * [strategy] is [ReplacementStrategy.THROW]. + * @throws [IndexOutOfBoundsException] If [offset] or [len] is negative, or if [offset] plus [len] exceeds the array size. + * */ + @JvmStatic + @JvmName("sizeOf") + public inline fun CharArray.sizeUTF8(utf8: UTF8, offset: Int, len: Int): Long { + return sizeUTF8(utf8.config.replacementStrategy, offset, len) + } + + /** + * Calculate the UTF-8 byte output size for [len] number of characters, starting at index [offset], + * from the provided array, and [ReplacementStrategy] for the [UTF8.Config]. + * + * @param [config] The [Config] to retrieve the [ReplacementStrategy] from. + * @param [offset] The index in the array to start at. + * @param [len] The number of characters, starting at index [offset]. + * + * @return The exact number of UTF-8 bytes this transformation would result in. + * + * @throws [MalformedEncodingException] If an invalid character sequence is encountered and the + * [strategy] is [ReplacementStrategy.THROW]. + * @throws [IndexOutOfBoundsException] If [offset] or [len] is negative, or if [offset] plus [len] exceeds the array size. 
+ * */ + @JvmStatic + @JvmName("sizeOf") + public inline fun CharArray.sizeUTF8(config: Config, offset: Int, len: Int): Long { + return sizeUTF8(config.replacementStrategy, offset, len) + } + + /** + * Calculate the UTF-8 byte output size for [len] number of characters, starting at index [offset], + * from the provided array, and [ReplacementStrategy]. + * + * @param [strategy] The [ReplacementStrategy]. + * @param [offset] The index in the array to start at. + * @param [len] The number of characters, starting at index [offset]. + * + * @return The exact number of UTF-8 bytes this transformation would result in. + * + * @throws [MalformedEncodingException] If an invalid character sequence is encountered and the + * [strategy] is [ReplacementStrategy.THROW]. + * @throws [IndexOutOfBoundsException] If [offset] or [len] is negative, or if [offset] plus [len] exceeds the array size. + * */ + @JvmStatic + @JvmName("sizeOf") + public fun CharArray.sizeUTF8(strategy: ReplacementStrategy, offset: Int, len: Int): Long { + if (offset < 0) throw IndexOutOfBoundsException("offset[$offset] < 0") + if (len < 0) throw IndexOutOfBoundsException("len[$len] < 0") + if (offset > size - len) { + throw IndexOutOfBoundsException("offset[$offset] > size[$size] - len[$len]") + } + + val cpp = of(strategy) + repeat(len) { i -> cpp + this[i + offset] } + return cpp.doFinal() + } + /** * Calculate the UTF-8 byte output size for the provided characters and [ReplacementStrategy] for the * [UTF8] encoder/decoder. * + * @param [utf8] The encoder/decoder to retrieve the [ReplacementStrategy] from. + * + * @return The exact number of UTF-8 bytes this transformation would result in. + * * @throws [MalformedEncodingException] If an invalid character sequence is encountered and the * [strategy] is [ReplacementStrategy.THROW]. * */ @@ -381,6 +469,10 @@ public open class UTF8: EncoderDecoder { * Calculate the UTF-8 byte output size for the provided characters and [ReplacementStrategy] for the * [UTF8.Config]. 
* + * @param [config] The [Config] to retrieve the [ReplacementStrategy] from. + * + * @return The exact number of UTF-8 bytes this transformation would result in. + * * @throws [MalformedEncodingException] If an invalid character sequence is encountered and the * [strategy] is [ReplacementStrategy.THROW]. * */ @@ -391,6 +483,10 @@ public open class UTF8: EncoderDecoder { /** * Calculate the UTF-8 byte output size for the provided characters and [ReplacementStrategy]. * + * @param [strategy] The [ReplacementStrategy]. + * + * @return The exact number of UTF-8 bytes this transformation would result in. + * * @throws [MalformedEncodingException] If an invalid character sequence is encountered and the * [strategy] is [ReplacementStrategy.THROW]. * */ @@ -402,10 +498,82 @@ public open class UTF8: EncoderDecoder { return cpp.doFinal() } + /** + * Calculate the UTF-8 byte output size for [len] number of characters, starting at index [offset], + * from the provided sequence, and [ReplacementStrategy] for the [UTF8] encoder/decoder. + * + * @param [utf8] The encoder/decoder to retrieve the [ReplacementStrategy] from. + * @param [offset] The index in the sequence to start at. + * @param [len] The number of characters, starting at index [offset]. + * + * @return The exact number of UTF-8 bytes this transformation would result in. + * + * @throws [MalformedEncodingException] If an invalid character sequence is encountered and the + * [strategy] is [ReplacementStrategy.THROW]. + * @throws [IndexOutOfBoundsException] If [offset] or [len] is negative, or if [offset] plus [len] exceeds the sequence length. + * */ + @JvmStatic + @JvmName("sizeOf") + public inline fun CharSequence.sizeUTF8(utf8: UTF8, offset: Int, len: Int): Long { + return sizeUTF8(utf8.config.replacementStrategy, offset, len) + } + + /** + * Calculate the UTF-8 byte output size for [len] number of characters, starting at index [offset], + * from the provided sequence, and [ReplacementStrategy] for the [UTF8.Config]. 
+ * + * @param [config] The [Config] to retrieve the [ReplacementStrategy] from. + * @param [offset] The index in the sequence to start at. + * @param [len] The number of characters, starting at index [offset]. + * + * @return The exact number of UTF-8 bytes this transformation would result in. + * + * @throws [MalformedEncodingException] If an invalid character sequence is encountered and the + * [strategy] is [ReplacementStrategy.THROW]. + * @throws [IndexOutOfBoundsException] If [offset] or [len] is negative, or if [offset] plus [len] exceeds the sequence length. + * */ + @JvmStatic + @JvmName("sizeOf") + public inline fun CharSequence.sizeUTF8(config: Config, offset: Int, len: Int): Long { + return sizeUTF8(config.replacementStrategy, offset, len) + } + + /** + * Calculate the UTF-8 byte output size for [len] number of characters, starting at index [offset], + * from the provided sequence, and [ReplacementStrategy]. + * + * @param [strategy] The [ReplacementStrategy]. + * @param [offset] The index in the sequence to start at. + * @param [len] The number of characters, starting at index [offset]. + * + * @return The exact number of UTF-8 bytes this transformation would result in. + * + * @throws [MalformedEncodingException] If an invalid character sequence is encountered and the + * [strategy] is [ReplacementStrategy.THROW]. + * @throws [IndexOutOfBoundsException] If [offset] or [len] is negative, or if [offset] plus [len] exceeds the sequence length. 
+ * */ + @JvmStatic + @JvmName("sizeOf") + public fun CharSequence.sizeUTF8(strategy: ReplacementStrategy, offset: Int, len: Int): Long { + if (offset < 0) throw IndexOutOfBoundsException("offset[$offset] < 0") + if (len < 0) throw IndexOutOfBoundsException("len[$len] < 0") + if (offset > length - len) { + throw IndexOutOfBoundsException("offset[$offset] > length[$length] - len[$len]") + } + + val cpp = of(strategy) + repeat(len) { i -> cpp + this[i + offset] } + return cpp.doFinal() + } + /** * Calculate the UTF-8 byte output size for the provided characters and [ReplacementStrategy] for the * [UTF8] encoder/decoder. * + * @param [utf8] The encoder/decoder to retrieve the [ReplacementStrategy] from. + * + * @return The exact number of UTF-8 bytes this transformation would result in. + * * @throws [MalformedEncodingException] If an invalid character sequence is encountered and the * [strategy] is [ReplacementStrategy.THROW]. * */ @@ -417,6 +585,10 @@ public open class UTF8: EncoderDecoder { * Calculate the UTF-8 byte output size for the provided characters and [ReplacementStrategy] for the * [UTF8.Config]. * + * @param [config] The [Config] to retrieve the [ReplacementStrategy] from. + * + * @return The exact number of UTF-8 bytes this transformation would result in. + * * @throws [MalformedEncodingException] If an invalid character sequence is encountered and the * [strategy] is [ReplacementStrategy.THROW]. * */ @@ -427,6 +599,10 @@ public open class UTF8: EncoderDecoder { /** * Calculate the UTF-8 byte output size for the provided characters and [ReplacementStrategy]. * + * @param [strategy] The [ReplacementStrategy]. + * + * @return The exact number of UTF-8 bytes this transformation would result in. + * * @throws [MalformedEncodingException] If an invalid character sequence is encountered and the * [strategy] is [ReplacementStrategy.THROW]. * */