Files
neovim/runtime/lua/vim/glob.lua
2025-06-03 06:36:44 -07:00

378 lines
12 KiB
Lua
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

--- @brief Glob-to-LPeg Converter (Peglob)
--- This module converts glob patterns to LPeg patterns according to the LSP 3.17 specification:
--- https://microsoft.github.io/language-server-protocol/specifications/lsp/3.17/specification/#pattern
---
--- Glob grammar overview:
--- - `*` to match zero or more characters in a path segment
--- - `?` to match on one character in a path segment
--- - `**` to match any number of path segments, including none
--- - `{}` to group conditions (e.g. `*.{ts,js}` matches TypeScript and JavaScript files)
--- - `[]` to declare a range of characters to match in a path segment
--- (e.g., `example.[0-9]` to match on `example.0`, `example.1`, …)
--- - `[!...]` to negate a range of characters to match in a path segment
--- (e.g., `example.[!0-9]` to match on `example.a`, `example.b`, but not `example.0`)
---
--- Additional constraints:
--- - A Glob pattern must match an entire path, with partial matches
--- considered failures.
--- - The pattern only determines success or failure, without specifying
--- which parts correspond to which characters.
--- - A *path segment* is the portion of a path between two adjacent path
--- separators (`/`), or between the start/end of the path and the nearest
--- separator.
--- - The `**` (*globstar*) pattern matches zero or more path segments,
--- including intervening separators (`/`). Within pattern strings, `**`
--- must be delimited by path separators (`/`) or pattern boundaries and
--- cannot be adjacent to any characters other than `/`. If `**` is not
--- the final element, it must be followed by `/`.
--- - `{}` (*braced conditions*) contains valid Glob patterns as branches,
--- separated by commas. Commas are exclusively used for separating
--- branches and cannot appear within a branch for any other purpose.
--- Nested `{}` structures are allowed, but `{}` must contain at least two
--- branches—zero or one branch is not permitted.
--- - In `[]` or `[!...]`, a *character range* consists of character
--- intervals (e.g., `a-z`) or individual characters (e.g., `w`). A range
--- including `/` wont match that character.
--- @diagnostic disable: missing-fields
local m = vim.lpeg
local mt = getmetatable(m.P(0))
local re = vim.re
local bit = require('bit')
local M = {}
-- Basic patterns for matching glob components
local letter = m.P(1) - m.S('*?[]{}/\\') -- Any character except special glob characters
local slash = m.P '/' * m.Cc(m.P '/') -- Path separator with capture
local notslash = m.P(1) - m.P '/' -- Any character except path separator
local notcomma = m.P(1) - m.S(',\\') -- Any character except comma and backslash
--- Handle EOF, considering whether we're in a segment or not
--- @type vim.lpeg.Pattern
local eof = -1
* m.Cb('inseg')
/ function(flag)
if flag then
return #m.P '/'
else
return m.P(-1)
end
end
---@alias pat_table { F: string?, [1]: string, [2]: vim.lpeg.Pattern }
---@alias seg_part { [string]: any, [integer]: pat_table }
--- @param p pat_table Initial segment pattern data
--- @return seg_part Segment structure with start pattern
local function start_seg(p)
return { s = p[2], e = true, n = 0 }
end
--- @param t seg_part Segment structure
--- @param p pat_table Pattern to look for
--- @return table Updated segment structure
local function lookfor(t, p)
t.n = t.n + 1
t[t.n] = p
return t
end
--- @param t seg_part Segment structure
--- @return table Segment structure with end pattern
local function to_seg_end(t)
t.e = notslash ^ 0
return t
end
--- Constructs a segment matching pattern from collected components
---
--- @param t seg_part Segment structure with patterns
--- @return vim.lpeg.Pattern Complete segment match pattern
local function end_seg(t)
--- @type table<any,any>
local seg_grammar = { 's' }
if t.n > 0 then
seg_grammar.s = t.s
for i = 1, t.n do
local rname = t[i][1]
if not seg_grammar[rname] then
-- Optimize search when deterministic first character is available
if t[i].F then
seg_grammar[rname] = t[i][2] + notslash * (notslash - m.P(t[i].F)) ^ 0 * m.V(rname)
else
seg_grammar[rname] = t[i][2] + notslash * m.V(rname)
end
end
seg_grammar.s = seg_grammar.s * m.V(rname)
end
if t.e then
seg_grammar.s = seg_grammar.s * t.e
end
return m.P(seg_grammar)
else
seg_grammar.s = t.s
if t.e then
seg_grammar.s = seg_grammar.s * t.e
end
return seg_grammar.s
end
end
--- @param p vim.lpeg.Pattern Pattern directly after `**/`
--- @return vim.lpeg.Pattern LPeg pattern for `**/p`
local function dseg(p)
return m.P { p + notslash ^ 0 * m.P '/' * m.V(1) }
end
--- @type (vim.lpeg.Pattern|table)
local g = nil
--- Multiplies conditions for braced expansion (Cartesian product)
---
--- @param a string|string[] First part
--- @param b string|string[] Second part
--- @return string|string[] Cartesian product of values
local function mul_cond(a, b)
if type(a) == 'string' then
if type(b) == 'string' then
return a .. b
elseif type(b) == 'table' then
for i = 1, #b do
b[i] = a .. b[i]
end
return b
else
return a
end
elseif type(a) == 'table' then
if type(b) == 'string' then
for i = 1, #a do
a[i] = a[i] .. b
end
return a
elseif type(b) == 'table' then
--- @type string[]
local res = {}
local idx = 0
for i = 1, #a do
for j = 1, #b do
idx = idx + 1
res[idx] = a[i] .. b[j]
end
end
return res
else
return a
end
else
return b
end
end
--- Combines alternatives in braced patterns
---
--- @param a string|table First part
--- @param b string|table Second part
--- @return table #Combined alternatives
local function add_cond(a, b)
if type(a) == 'string' then
if type(b) == 'string' then
return { a, b }
elseif type(b) == 'table' then
table.insert(b, 1, a)
return b
end
elseif type(a) == 'table' then
if type(b) == 'string' then
table.insert(a, b)
return a
elseif type(b) == 'table' then
for i = 1, #b do
table.insert(a, b[i])
end
return a
end
--- @diagnostic disable-next-line: missing-return
end
end
--- Expands patterns handling segment boundaries
--- `#` prefix is added for sub-grammar to detect in-segment flag
---
---@param a (any[]|vim.lpeg.Pattern[]) Array of patterns
---@param b string Tail string
---@param inseg boolean Whether inside a path segment
---@return vim.lpeg.Pattern #Expanded pattern
local function expand(a, b, inseg)
for i = 1, #a do
if inseg then
a[i] = '#' .. a[i]
end
a[i] = g:match(a[i] .. b)
end
local res = a[1]
for i = 2, #a do
res = res + a[i]
end
return res
end
--- Converts a UTF-8 character to its Unicode codepoint
---
--- @param utf8_str string UTF-8 character
--- @return number #Codepoint value
local function to_codepoint(utf8_str)
local codepoint = 0
local byte_count = 0
for i = 1, #utf8_str do
local byte = utf8_str:byte(i)
if byte_count ~= 0 then
codepoint = bit.bor(bit.lshift(codepoint, 6), bit.band(byte, 0x3F))
byte_count = byte_count - 1
else
if byte < 0x80 then
codepoint = byte
elseif byte < 0xE0 then
byte_count = 1
codepoint = bit.band(byte, 0x1F)
elseif byte < 0xF0 then
byte_count = 2
codepoint = bit.band(byte, 0x0F)
else
byte_count = 3
codepoint = bit.band(byte, 0x07)
end
end
if byte_count == 0 then
break
end
end
return codepoint
end
--- Pattern for matching UTF-8 characters
local cont = m.R('\128\191')
local any_utf8 = m.R('\0\127')
+ m.R('\194\223') * cont
+ m.R('\224\239') * cont * cont
+ m.R('\240\244') * cont * cont * cont
--- Creates a character class pattern for glob ranges
--- @param inv string Inversion flag ('!' or '')
--- @param ranges (string|string[])[] Character ranges
--- @return vim.lpeg.Pattern #Character class pattern
local function class(inv, ranges)
local patt = m.P(false)
if #ranges == 0 then
if inv == '!' then
return m.P '[!]'
else
return m.P '[]'
end
end
for _, v in ipairs(ranges) do
patt = patt + (type(v) == 'table' and m.utfR(to_codepoint(v[1]), to_codepoint(v[2])) or m.P(v))
end
if inv == '!' then
patt = m.P(1) - patt --[[@as vim.lpeg.Pattern]]
end
return patt - m.P '/'
end
-- Parse constraints for optimizing braced conditions
local noopt_condlist = re.compile [[
s <- '/' / '**' / . [^/*]* s
]]
local opt_tail = re.compile [[
s <- (!'**' [^{/])* &'/'
]]
-- stylua: ignore start
--- @nodoc
--- @diagnostic disable
--- Main grammar for glob pattern matching
g = {
'Glob',
Glob = (m.P'#' * m.Cg(m.Cc(true), 'inseg') + m.Cg(m.Cc(false), 'inseg')) *
m.Cf(m.V'Element'^-1 * (slash * m.V'Element')^0 * (slash^-1 * eof), mt.__mul),
-- Elements handle segments, globstar patterns
Element = m.V'DSeg' + m.V'DSEnd' + m.Cf(m.V'Segment' * (slash * m.V'Segment')^0 * (slash * eof + eof^-1), mt.__mul),
-- Globstar patterns
DSeg = m.P'**/' * ((m.V'Element' + eof) / dseg),
DSEnd = m.P'**' * -1 * m.Cc(m.P(1)^0),
-- Segment handling with word and star patterns
Segment = (m.V'Word' / start_seg + m.Cc({ '', true }) / start_seg * (m.V'Star' * m.V'Word' % lookfor)) *
(m.V'Star' * m.V'Word' % lookfor)^0 * (m.V'Star' * m.V'CheckBnd' % to_seg_end)^-1 / end_seg
+ m.V'Star' * m.V'CheckBnd' * m.Cc(notslash^0),
CheckBnd = #m.P'/' + -1, -- Boundary constraint
-- Word patterns for fixed-length matching
Word = -m.P'*' * m.Ct( m.V('FIRST')^-1 * m.C(m.V'WordAux') ),
WordAux = m.V'Branch' + m.Cf(m.V'Simple'^1 * m.V'Branch'^-1, mt.__mul),
Simple = m.Cg( m.V'Token' * (m.V'Token' % mt.__mul)^0 * (m.V'Boundary' % mt.__mul)^-1),
Boundary = #m.P'/' * m.Cc(#m.P'/') + eof,
Token = m.V'Ques' + m.V'Class' + m.V'Escape' + m.V'Literal',
Star = m.P'*',
Ques = m.P'?' * m.Cc(notslash),
Escape = m.P'\\' * m.C(1) / m.P,
Literal = m.C(letter^1) / m.P,
-- Branch handling for braced conditions
Branch = m.Cmt(m.C(m.V'CondList'), function(s, i, p1, p2)
-- Optimize brace expansion when possible
-- p1: string form of condition list, p2: transformed lua table
if noopt_condlist:match(p1) then
-- Cannot optimize, match till the end
return #s + 1, p2, s:sub(i)
end
-- Find point to cut for optimization
local cut = opt_tail:match(s, i)
if cut then
-- Can optimize: match till cut point
-- true flag tells expand to transform EOF matches to &'/' predicates
return cut, p2, s:sub(i, cut - 1), true
else
-- Cannot optimize
return #s + 1, p2, s:sub(i)
end
end) / expand,
-- Brace expansion handling
CondList = m.Cf(m.P'{' * m.V'Cond' * (m.P',' * m.V'Cond')^1 * m.P'}', add_cond),
Cond = m.Cf((m.C((notcomma + m.P'\\' * 1 - m.S'{}')^1) + m.V'CondList')^1, mul_cond) + m.C(true),
-- Character class handling
Class = m.P'[' * m.C(m.P'!'^-1) * m.Ct(
(m.Ct(m.C(any_utf8) * m.P'-' * m.C(any_utf8 - m.P']')) + m.C(any_utf8 - m.P']'))^0
) * m.P']' / class,
-- Deterministic first character extraction for optimization
FIRST = m.Cg(m.P(function(s, i)
if letter:match(s, i) then return true, s:sub(i, i)
else return false end
end), 'F')
}
-- stylua: ignore end
--- @diagnostic enable
--- @nodoc
g = m.P(g)
--- Parses a raw glob into an |lua-lpeg| pattern.
---
---@param pattern string The raw glob pattern
---@return vim.lpeg.Pattern #An |lua-lpeg| representation of the pattern
function M.to_lpeg(pattern)
local lpeg_pattern = g:match(pattern) --[[@as vim.lpeg.Pattern?]]
assert(lpeg_pattern, string.format('Invalid glob: %s', pattern))
return lpeg_pattern
end
return M