feat(stdlib): overload vim.str_byteindex, vim.str_utfindex #30735

PROBLEM:
There are several limitations to vim.str_byteindex, vim.str_utfindex:
1. They throw when given out-of-range indexes. An invalid (often user/lsp-provided)
   index doesn't feel exceptional and should be handled by the caller.
   `:help dev-error-patterns` suggests that `retval, errmsg` is the preferred
   way to handle this kind of failure.
2. They cannot accept an encoding. So LSP needs wrapper functions. #25272
3. The current signatures are not extensible.
    * Calling: The function currently uses a fairly opaque boolean value to
      identify the encoding.
    * Returns: The fact it can throw requires wrapping in pcall.
4. The current name doesn't follow suggestions in `:h dev-naming` and I think
   `get` would be suitable.

SOLUTION:
- Because these are performance-sensitive, don't introduce `opts`.
- Introduce an "overload" that accepts `encoding:string` and
  `strict_indexing:bool` params.

```lua
local col = vim.str_utfindex(line, encoding, [index, [no_out_of_range]])
```

Support the old versions by dispatching on the type of argument 2, and
deprecate that form.

```lua
vim.str_utfindex(line)                             -- (utf-32 length, utf-16 length), deprecated
vim.str_utfindex(line, index)                      -- (utf-32 index, utf-16 index), deprecated
vim.str_utfindex(line, 'utf-16')                   -- utf-16 length
vim.str_utfindex(line, 'utf-16', index)            -- utf-16 index
vim.str_utfindex(line, 'utf-16', math.huge)        -- error: index out of range
vim.str_utfindex(line, 'utf-16', math.huge, false) -- utf-16 length
```
This commit is contained in:
Tristan Knight
2024-10-23 14:33:57 +01:00
committed by GitHub
parent 3a86b60032
commit 230b0c7f02
5 changed files with 283 additions and 68 deletions

View File

@ -68,6 +68,12 @@ vim.log = {
},
}
-- Lookup set of the encoding names that vim.str_byteindex() and
-- vim.str_utfindex() accept; membership is tested with `utfs[name]`.
local utfs = { ['utf-8'] = true, ['utf-16'] = true, ['utf-32'] = true }
-- TODO(lewis6991): document that the signature is system({cmd}, [{opts},] {on_exit})
--- Runs a system command or throws an error if {cmd} cannot be run.
---
@ -714,7 +720,127 @@ function vim._on_key(buf, typed_buf)
end
end
--- Generates a list of possible completions for the string.
--- Translates an {index} expressed in UTF-32, UTF-16 or UTF-8 units into a
--- byte index of {s}.
---
--- When {strict_indexing} is false, an out-of-range {index} yields the byte
--- length of {s} instead of raising an error.
---
--- Invalid UTF-8 and NUL is treated like in |vim.str_utfindex()|.
--- An {index} in the middle of a UTF-16 sequence is rounded upwards to
--- the end of that sequence.
---@param s string
---@param encoding "utf-8"|"utf-16"|"utf-32"
---@param index integer
---@param strict_indexing? boolean # default: true
---@return integer
function vim.str_byteindex(s, encoding, index, strict_indexing)
  if type(encoding) == 'number' then
    -- Legacy call shape, selected by a numeric second argument:
    --   • {str}        (`string`)
    --   • {index}      (`integer`)   -- arrives in the `encoding` slot
    --   • {use_utf16}  (`boolean?`)  -- arrives in the `index` slot
    local legacy_byte = vim.__str_byteindex(s, encoding, index or false)
    if not legacy_byte then
      error('index out of range')
    end
    return legacy_byte
  end

  vim.validate('s', s, 'string')
  vim.validate('index', index, 'number')

  -- An empty string or a zero index always maps to byte 0; this shortcut is
  -- taken before the encoding and strict_indexing arguments are validated.
  local nbytes = #s
  if nbytes == 0 or index == 0 then
    return 0
  end

  vim.validate('encoding', encoding, function(name)
    return utfs[name], 'invalid encoding'
  end)
  vim.validate('strict_indexing', strict_indexing, 'boolean', true)
  -- After validation the argument is nil, true or false; nil means strict.
  local strict = strict_indexing ~= false

  if encoding == 'utf-8' then
    -- UTF-8 indices already are byte indices; only the upper bound is checked.
    if index > nbytes then
      if strict then
        error('index out of range')
      end
      return nbytes
    end
    return index
  end

  local byte = vim.__str_byteindex(s, index, encoding == 'utf-16')
  if byte then
    return byte
  end
  -- A falsy result from the helper is treated as "index out of range".
  if strict then
    error('index out of range')
  end
  return nbytes
end
--- Translates a byte {index} of {s} into a UTF-32, UTF-16 or UTF-8 index.
--- When {index} is omitted, the length of the string is used. All indices
--- are zero-based.
---
--- When {strict_indexing} is false, an out-of-range {index} yields the string
--- length (in the requested encoding) instead of raising an error.
--- Invalid UTF-8 bytes, and embedded surrogates are counted as one code point
--- each. An {index} in the middle of a UTF-8 sequence is rounded upwards to the
--- end of that sequence.
---@param s string
---@param encoding "utf-8"|"utf-16"|"utf-32"
---@param index? integer
---@param strict_indexing? boolean # default: true
---@return integer
function vim.str_utfindex(s, encoding, index, strict_indexing)
  if encoding == nil or type(encoding) == 'number' then
    -- Legacy call shape, selected by a missing or numeric second argument:
    --   • {str}    (`string`)
    --   • {index}  (`integer?`)  -- arrives in the `encoding` slot
    -- Return (multiple): ~
    --   (`integer`) UTF-32 index
    --   (`integer`) UTF-16 index
    local legacy32, legacy16 = vim.__str_utfindex(s, encoding) --[[@as integer,integer]]
    if not (legacy32 and legacy16) then
      error('index out of range')
    end
    return legacy32, legacy16
  end

  vim.validate('s', s, 'string')
  vim.validate('index', index, 'number', true)

  if index == nil then
    -- No index means "whole string": request past the end and force
    -- non-strict handling so the length is returned rather than an error.
    index = math.huge
    strict_indexing = false
  end

  -- Index 0 maps to 0 in every encoding; this shortcut is taken before the
  -- encoding and strict_indexing arguments are validated.
  if index == 0 then
    return 0
  end

  vim.validate('encoding', encoding, function(name)
    return utfs[name], 'invalid encoding'
  end)
  vim.validate('strict_indexing', strict_indexing, 'boolean', true)
  -- After validation the argument is nil, true or false; nil means strict.
  local strict = strict_indexing ~= false

  if encoding == 'utf-8' then
    -- UTF-8 indices already are byte indices; only the upper bound is checked.
    local nbytes = #s
    if index <= nbytes then
      return index
    end
    if strict then
      error('index out of range')
    end
    return nbytes
  end

  local want_utf16 = encoding == 'utf-16'

  local idx32, idx16 = vim.__str_utfindex(s, index) --[[@as integer?,integer?]]
  local result = idx32
  if want_utf16 and idx16 then
    result = idx16
  end
  if result then
    return result
  end

  -- A falsy result from the helper is treated as "index out of range".
  if strict then
    error('index out of range')
  end

  -- Non-strict fallback: ask the helper for the full lengths of the string.
  local len32, len16 = vim.__str_utfindex(s) --[[@as integer,integer]]
  if want_utf16 and len16 then
    return len16
  end
  return len32
end
--- Generates a list of possible completions for the str
--- String has the pattern.
---
--- 1. Can we get it to just return things in the global namespace with that name prefix