Merge pull request #10513 from bfredl/bytecount

api/lua: add {byte_count} parameter to line region change event
2025-07-16 01:01:49 +00:00 · 2019-08-06 20:25:46 +02:00
parent dc1359bf8e c0993ed343
commit 6fb0020df4
10 changed files with 309 additions and 38 deletions
--- a/runtime/doc/api.txt
+++ b/runtime/doc/api.txt
@ -200,17 +200,26 @@ User reloads the buffer with ":edit", emits: >
  nvim_buf_detach_event[{buf}]

                                                        *api-buffer-updates-lua*
-In-process lua plugins can also recieve buffer updates, in the form of lua
+In-process lua plugins can also receive buffer updates, in the form of lua
 callbacks. These callbacks are called frequently in various contexts, buffer
 contents or window layout should not be changed inside these |textlock|.
 |vim.schedule| can be used to defer these operations to the main loop, where
 they are allowed.

 |nvim_buf_attach| will take keyword args for the callbacks. "on_lines" will
-receive parameters ("lines", {buf}, {changedtick}, {firstline}, {lastline}, {new_lastline}).
-Unlike remote channels the text contents are not passed. The new text can be
-accessed inside the callback as
-`vim.api.nvim_buf_get_lines(buf, firstline, new_lastline, true)`
+receive parameters ("lines", {buf}, {changedtick}, {firstline}, {lastline},
+{new_lastline}, {old_byte_size}[, {old_utf32_size}, {old_utf16_size}]).
+Unlike remote channel events the text contents are not passed. The new text can
+be accessed inside the callback as
+
+    `vim.api.nvim_buf_get_lines(buf, firstline, new_lastline, true)`
+
+{old_byte_size} is the total size of the replaced region {firstline} to
+{lastline} in bytes, including the final newline after {lastline}. If
+`utf_sizes` is set to true in |nvim_buf_attach()| keyword args, then the
+UTF-32 and UTF-16 sizes of the deleted region are also passed as additional
+arguments {old_utf32_size} and {old_utf16_size}.
+
 "on_changedtick" is invoked when |b:changedtick| was incremented but no text
 was changed. The parameters recieved are ("changedtick", {buf}, {changedtick}).

--- a/src/nvim/api/buffer.c
+++ b/src/nvim/api/buffer.c
@ -109,9 +109,11 @@ String buffer_get_line(Buffer buffer, Integer index, Error *err)
 ///        `nvim_buf_lines_event`. Otherwise, the first notification will be
 ///        a `nvim_buf_changedtick_event`. Not used for lua callbacks.
 /// @param  opts  Optional parameters.
-///               `on_lines`: lua callback received on change.
+///               `on_lines`:       lua callback received on change.
 ///               `on_changedtick`: lua callback received on changedtick
 ///                                 increment without text change.
+///               `utf_sizes`:      include UTF-32 and UTF-16 size of
+///                                 the replaced region.
 ///               See |api-buffer-updates-lua| for more information
 /// @param[out] err Error details, if any
 /// @return False when updates couldn't be enabled because the buffer isn't
@ -156,6 +158,12 @@ Boolean nvim_buf_attach(uint64_t channel_id,
      }
      cb.on_detach = v->data.luaref;
      v->data.integer = LUA_NOREF;
+    } else if (is_lua && strequal("utf_sizes", k.data)) {
+      if (v->type != kObjectTypeBoolean) {
+        api_set_error(err, kErrorTypeValidation, "utf_sizes must be boolean");
+        goto error;
+      }
+      cb.utf_sizes = v->data.boolean;
    } else {
      api_set_error(err, kErrorTypeValidation, "unexpected key: %s", k.data);
      goto error;
@ -1176,6 +1184,30 @@ free_exit:
  return 0;
 }

+Dictionary nvim__buf_stats(Buffer buffer, Error *err)
+{
+  Dictionary rv = ARRAY_DICT_INIT;
+
+  buf_T *buf = find_buffer_by_handle(buffer, err);
+  if (!buf) {
+    return rv;
+  }
+
+  // Number of times the cached line was flushed.
+  // This should generally not increase while editing the same
+  // line in the same mode.
+  PUT(rv, "flush_count", INTEGER_OBJ(buf->flush_count));
+  // lnum of current line
+  PUT(rv, "current_lnum", INTEGER_OBJ(buf->b_ml.ml_line_lnum));
+  // whether the line has unflushed changes.
+  PUT(rv, "line_dirty", BOOLEAN_OBJ(buf->b_ml.ml_flags & ML_LINE_DIRTY));
+  // NB: this should be zero at any time API functions are called,
+  // this exists to debug issues
+  PUT(rv, "dirty_bytes", INTEGER_OBJ((Integer)buf->deleted_bytes));
+
+  return rv;
+}
+
 // Check if deleting lines made the cursor position invalid.
 // Changed lines from `lo` to `hi`; added `extra` lines (negative if deleted).
 static void fix_cursor(linenr_T lo, linenr_T hi, linenr_T extra)
--- a/src/nvim/buffer_defs.h
+++ b/src/nvim/buffer_defs.h
@ -459,8 +459,9 @@ typedef struct {
  LuaRef on_lines;
  LuaRef on_changedtick;
  LuaRef on_detach;
+  bool utf_sizes;
 } BufUpdateCallbacks;
-#define BUF_UPDATE_CALLBACKS_INIT { LUA_NOREF, LUA_NOREF, LUA_NOREF }
+#define BUF_UPDATE_CALLBACKS_INIT { LUA_NOREF, LUA_NOREF, LUA_NOREF, false }

 #define BUF_HAS_QF_ENTRY 1
 #define BUF_HAS_LL_ENTRY 2
@ -802,11 +803,26 @@ struct file_buffer {

  kvec_t(BufhlLine *) b_bufhl_move_space;  // temporary space for highlights

-  // array of channelids which have asked to receive updates for this
+  // array of channel_id:s which have asked to receive updates for this
  // buffer.
  kvec_t(uint64_t) update_channels;
+  // array of lua callbacks for buffer updates.
  kvec_t(BufUpdateCallbacks) update_callbacks;

+  // whether an update callback has requested codepoint size of deleted regions.
+  bool update_need_codepoints;
+
+  // Measurements of the deleted or replaced region since the last update
+  // event. Some consumers of buffer changes need to know the byte size (like
+  // tree-sitter) or the corresponding UTF-32/UTF-16 size (like LSP) of the
+  // deleted text.
+  size_t deleted_bytes;
+  size_t deleted_codepoints;
+  size_t deleted_codeunits;
+
+  // The number of times the current line has been flushed in the memline.
+  int flush_count;
+
  int b_diff_failed;    // internal diff failed for this buffer
 };

--- a/src/nvim/buffer_updates.c
+++ b/src/nvim/buffer_updates.c
@ -26,6 +26,9 @@ bool buf_updates_register(buf_T *buf, uint64_t channel_id,

  if (channel_id == LUA_INTERNAL_CALL) {
    kv_push(buf->update_callbacks, cb);
+    if (cb.utf_sizes) {
+      buf->update_need_codepoints = true;
+    }
    return true;
  }

@ -169,6 +172,10 @@ void buf_updates_send_changes(buf_T *buf,
                              int64_t num_removed,
                              bool send_tick)
 {
+  size_t deleted_codepoints, deleted_codeunits;
+  size_t deleted_bytes = ml_flush_deleted_bytes(buf, &deleted_codepoints,
+                                                &deleted_codeunits);
+
  if (!buf_updates_active(buf)) {
    return;
  }
@ -231,8 +238,8 @@ void buf_updates_send_changes(buf_T *buf,
    bool keep = true;
    if (cb.on_lines != LUA_NOREF) {
      Array args = ARRAY_DICT_INIT;
-      Object items[5];
-      args.size = 5;
+      Object items[8];
+      args.size = 6;  // may be increased to 8 below
      args.items = items;

      // the first argument is always the buffer handle
@ -250,6 +257,13 @@ void buf_updates_send_changes(buf_T *buf,
      // the last line in the updated range
      args.items[4] = INTEGER_OBJ(firstline - 1 + num_added);

+      // byte count of previous contents
+      args.items[5] = INTEGER_OBJ((Integer)deleted_bytes);
+      if (cb.utf_sizes) {
+        args.size = 8;
+        args.items[6] = INTEGER_OBJ((Integer)deleted_codepoints);
+        args.items[7] = INTEGER_OBJ((Integer)deleted_codeunits);
+      }
      textlock++;
      Object res = executor_exec_lua_cb(cb.on_lines, "lines", args, true);
      textlock--;
--- a/src/nvim/fileio.c
+++ b/src/nvim/fileio.c
@ -1755,6 +1755,9 @@ failed:
      ml_delete(curbuf->b_ml.ml_line_count, false);
      linecnt--;
    }
+    curbuf->deleted_bytes = 0;
+    curbuf->deleted_codepoints = 0;
+    curbuf->deleted_codeunits = 0;
    linecnt = curbuf->b_ml.ml_line_count - linecnt;
    if (filesize == 0)
      linecnt = 0;
--- a/src/nvim/globals.h
+++ b/src/nvim/globals.h
@ -627,6 +627,8 @@ EXTERN pos_T Insstart_orig;
 EXTERN int orig_line_count INIT(= 0);       /* Line count when "gR" started */
 EXTERN int vr_lines_changed INIT(= 0);      /* #Lines changed by "gR" so far */

+// nonzero inhibits deleted-size counting during internal delete/replace
+EXTERN int inhibit_delete_count INIT(= 0);

 /*
 * These flags are set based upon 'fileencoding'.
--- a/src/nvim/mbyte.c
+++ b/src/nvim/mbyte.c
@ -1438,6 +1438,39 @@ int utf16_to_utf8(const wchar_t *strw, char **str)

 #endif

+/// Measure the length of a string in corresponding UTF-32 and UTF-16 units.
+///
+/// Invalid UTF-8 bytes, or embedded surrogates, count as one code point/unit
+/// each.
+///
+/// The out parameters are incremented. This is used to measure the size of
+/// a buffer region consisting of multiple line segments.
+///
+/// @param s the string
+/// @param len maximum length (an earlier NUL terminates)
+/// @param[in,out] codepoints incremented with UTF-32 code point size
+/// @param[in,out] codeunits incremented with UTF-16 code unit size
+void mb_utflen(const char_u *s, size_t len, size_t *codepoints,
+               size_t *codeunits)
+  FUNC_ATTR_NONNULL_ALL
+{
+  size_t count = 0, extra = 0;
+  size_t clen;
+  for (size_t i = 0; i < len && s[i] != NUL; i += clen) {
+    clen = utf_ptr2len_len(s+i, len-i);
+    // NB: gets the byte value of invalid sequence bytes.
+    // we only care whether the char fits in the BMP or not
+    int c = (clen > 1) ? utf_ptr2char(s+i) : s[i];
+    count++;
+    if (c > 0xFFFF) {
+      extra++;
+    }
+  }
+  *codepoints += count;
+  *codeunits += count + extra;
+}
+
+
 /*
 * Version of strnicmp() that handles multi-byte characters.
 * Needed for Big5, Shift-JIS and UTF-8 encoding.  Other DBCS encodings can
--- a/src/nvim/memline.c
+++ b/src/nvim/memline.c
@ -2383,6 +2383,23 @@ static int ml_append_int(
  return OK;
 }

+void ml_add_deleted_len(char_u *ptr, ssize_t len)
+{
+  if (inhibit_delete_count) {
+    return;
+  }
+  if (len == -1) {
+    len = STRLEN(ptr);
+  }
+  curbuf->deleted_bytes += len+1;
+  if (curbuf->update_need_codepoints) {
+    mb_utflen(ptr, len, &curbuf->deleted_codepoints,
+              &curbuf->deleted_codeunits);
+    curbuf->deleted_codepoints++;  // NL char
+    curbuf->deleted_codeunits++;
+  }
+}
+
 /*
 * Replace line lnum, with buffering, in current buffer.
 *
@ -2403,13 +2420,24 @@ int ml_replace(linenr_T lnum, char_u *line, bool copy)
  if (curbuf->b_ml.ml_mfp == NULL && open_buffer(FALSE, NULL, 0) == FAIL)
    return FAIL;

+  bool readlen = true;
+
  if (copy) {
    line = vim_strsave(line);
  }
-  if (curbuf->b_ml.ml_line_lnum != lnum)            /* other line buffered */
-    ml_flush_line(curbuf);                          /* flush it */
-  else if (curbuf->b_ml.ml_flags & ML_LINE_DIRTY)   /* same line allocated */
-    xfree(curbuf->b_ml.ml_line_ptr);             /* free it */
+  if (curbuf->b_ml.ml_line_lnum != lnum) {  // other line buffered
+    ml_flush_line(curbuf);  // flush it
+  } else if (curbuf->b_ml.ml_flags & ML_LINE_DIRTY) {  // same line allocated
+    ml_add_deleted_len(curbuf->b_ml.ml_line_ptr, -1);
+    readlen = false;  // already added the length
+
+    xfree(curbuf->b_ml.ml_line_ptr);  // free it
+  }
+
+  if (readlen && kv_size(curbuf->update_callbacks)) {
+    ml_add_deleted_len(ml_get_buf(curbuf, lnum, false), -1);
+  }
+
  curbuf->b_ml.ml_line_ptr = line;
  curbuf->b_ml.ml_line_lnum = lnum;
  curbuf->b_ml.ml_flags = (curbuf->b_ml.ml_flags | ML_LINE_DIRTY) & ~ML_EMPTY;
@ -2491,6 +2519,10 @@ static int ml_delete_int(buf_T *buf, linenr_T lnum, bool message)
  else
    line_size = ((dp->db_index[idx - 1]) & DB_INDEX_MASK) - line_start;

+  // Line should always have an NL char internally (represented as NUL),
+  // even if 'noeol' is set.
+  assert(line_size >= 1);
+  ml_add_deleted_len((char_u *)dp + line_start, line_size-1);

  /*
   * special case: If there is only one line in the data block it becomes empty.
@ -2676,6 +2708,17 @@ void ml_clearmarked(void)
  return;
 }

+size_t ml_flush_deleted_bytes(buf_T *buf, size_t *codepoints, size_t *codeunits)
+{
+  size_t ret = buf->deleted_bytes;
+  *codepoints = buf->deleted_codepoints;
+  *codeunits = buf->deleted_codeunits;
+  buf->deleted_bytes = 0;
+  buf->deleted_codepoints = 0;
+  buf->deleted_codeunits = 0;
+  return ret;
+}
+
 /*
 * flush ml_line if necessary
 */
@ -2704,6 +2747,8 @@ static void ml_flush_line(buf_T *buf)
      return;
    entered = TRUE;

+    buf->flush_count++;
+
    lnum = buf->b_ml.ml_line_lnum;
    new_line = buf->b_ml.ml_line_ptr;

--- a/src/nvim/misc1.c
+++ b/src/nvim/misc1.c
@ -780,6 +780,7 @@ open_line (
    did_append = FALSE;
  }

+  inhibit_delete_count++;
  if (newindent
      || did_si
      ) {
@ -821,6 +822,7 @@ open_line (
      did_si = false;
    }
  }
+  inhibit_delete_count--;

  /*
   * In REPLACE mode, for each character in the extra leader, there must be
@ -1685,6 +1687,7 @@ int del_bytes(colnr_T count, bool fixpos_arg, bool use_delcombine)
  bool was_alloced = ml_line_alloced();     // check if oldp was allocated
  char_u *newp;
  if (was_alloced) {
+    ml_add_deleted_len(curbuf->b_ml.ml_line_ptr, oldlen);
    newp = oldp;                            // use same allocated memory
  } else {                                  // need to allocate a new line
    newp = xmalloc((size_t)(oldlen + 1 - count));
--- a/test/functional/lua/buffer_updates_spec.lua
+++ b/test/functional/lua/buffer_updates_spec.lua
@ -5,28 +5,31 @@ local command = helpers.command
 local meths = helpers.meths
 local clear = helpers.clear
 local eq = helpers.eq
+local exec_lua = helpers.exec_lua
+local feed = helpers.feed

 local origlines = {"original line 1",
                   "original line 2",
                   "original line 3",
                   "original line 4",
                   "original line 5",
-                   "original line 6"}
+                   "original line 6",
+                   "    indented line"}

 describe('lua: buffer event callbacks', function()
  before_each(function()
    clear()
-    meths.execute_lua([[
+    exec_lua([[
      local events = {}

-      function test_register(bufnr, id, changedtick)
+      function test_register(bufnr, id, changedtick, utf_sizes)
        local function callback(...)
          table.insert(events, {id, ...})
          if test_unreg == id then
            return true
          end
        end
-        local opts = {on_lines=callback, on_detach=callback}
+        local opts = {on_lines=callback, on_detach=callback, utf_sizes=utf_sizes}
        if changedtick then
          opts.on_changedtick = callback
        end
@ -38,55 +41,166 @@ describe('lua: buffer event callbacks', function()
        events = {}
        return ret_events
      end
-    ]], {})
+    ]])
  end)

-  it('works', function()
+
+  -- verifying the sizes with nvim_buf_get_offset is nice (checks we cannot
+  -- assert the wrong thing), but masks errors with unflushed lines (as
+  -- nvim_buf_get_offset forces a flush of the memline). To be safe run the
+  -- test both ways.
+  local function check(verify,utf_sizes)
+    local lastsize
    meths.buf_set_lines(0, 0, -1, true, origlines)
-    meths.execute_lua("return test_register(...)", {0, "test1"})
+    if verify then
+      lastsize = meths.buf_get_offset(0, meths.buf_line_count(0))
+    end
+    exec_lua("return test_register(...)", 0, "test1",false,utf_sizes)
    local tick = meths.buf_get_changedtick(0)

+    local verify_name = "test1"
+    local function check_events(expected)
+      local events = exec_lua("return get_events(...)" )
+      if utf_sizes then
+        -- this test case uses ASCII only, so sizes should be the same.
+        -- Unicode is tested below.
+        for _, event in ipairs(expected) do
+          event[9] = event[8]
+          event[10] = event[8]
+        end
+      end
+      eq(expected, events)
+      if verify then
+        for _, event in ipairs(events) do
+          if event[1] == verify_name and event[2] == "lines" then
+            local startline, endline = event[5], event[7]
+            local newrange = meths.buf_get_offset(0, endline) - meths.buf_get_offset(0, startline)
+            local newsize = meths.buf_get_offset(0, meths.buf_line_count(0))
+            local oldrange = newrange + lastsize - newsize
+            eq(oldrange, event[8])
+            lastsize = newsize
+          end
+        end
+      end
+    end
+
+    command('set autoindent')
    command('normal! GyyggP')
    tick = tick + 1
-    eq({{ "test1", "lines", 1, tick, 0, 0, 1 }},
-       meths.execute_lua("return get_events(...)", {}))
+    check_events({{ "test1", "lines", 1, tick, 0, 0, 1, 0}})

    meths.buf_set_lines(0, 3, 5, true, {"changed line"})
    tick = tick + 1
-    eq({{ "test1", "lines", 1, tick, 3, 5, 4 }},
-       meths.execute_lua("return get_events(...)", {}))
+    check_events({{ "test1", "lines", 1, tick, 3, 5, 4, 32 }})

-    meths.execute_lua("return test_register(...)", {0, "test2", true})
+    exec_lua("return test_register(...)", 0, "test2", true, utf_sizes)
    tick = tick + 1
    command('undo')

    -- plugins can opt in to receive changedtick events, or choose
    -- to only recieve actual changes.
-    eq({{ "test1", "lines", 1, tick, 3, 4, 5 },
-        { "test2", "lines", 1, tick, 3, 4, 5 },
-        { "test2", "changedtick", 1, tick+1 } },
-       meths.execute_lua("return get_events(...)", {}))
+    check_events({{ "test1", "lines", 1, tick, 3, 4, 5, 13 },
+        { "test2", "lines", 1, tick, 3, 4, 5, 13 },
+        { "test2", "changedtick", 1, tick+1 } })
    tick = tick + 1

    -- simulate next callback returning true
-    meths.execute_lua("test_unreg = 'test1'", {})
+    exec_lua("test_unreg = 'test1'")

    meths.buf_set_lines(0, 6, 7, true, {"x1","x2","x3"})
    tick = tick + 1

    -- plugins can opt in to receive changedtick events, or choose
    -- to only recieve actual changes.
-    eq({{ "test1", "lines", 1, tick, 6, 7, 9 },
-        { "test2", "lines", 1, tick, 6, 7, 9 }},
-       meths.execute_lua("return get_events(...)", {}))
+    check_events({{ "test1", "lines", 1, tick, 6, 7, 9, 16 },
+        { "test2", "lines", 1, tick, 6, 7, 9, 16 }})
+
+    verify_name = "test2"

    meths.buf_set_lines(0, 1, 1, true, {"added"})
    tick = tick + 1
-    eq({{ "test2", "lines", 1, tick, 1, 1, 2 }},
-       meths.execute_lua("return get_events(...)", {}))
+    check_events({{ "test2", "lines", 1, tick, 1, 1, 2, 0 }})
+
+    feed('wix')
+    tick = tick + 1
+    check_events({{ "test2", "lines", 1, tick, 4, 5, 5, 16 }})
+
+    -- check hot path for multiple insert
+    feed('yz')
+    tick = tick + 1
+    check_events({{ "test2", "lines", 1, tick, 4, 5, 5, 17 }})
+
+    feed('<bs>')
+    tick = tick + 1
+    check_events({{ "test2", "lines", 1, tick, 4, 5, 5, 19 }})
+
+    feed('<esc>Go')
+    tick = tick + 1
+    check_events({{ "test2", "lines", 1, tick, 11, 11, 12, 0 }})
+
+    feed('x')
+    tick = tick + 1
+    check_events({{ "test2", "lines", 1, tick, 11, 12, 12, 5 }})

    command('bwipe!')
-    eq({{ "test2", "detach", 1 }},
-       meths.execute_lua("return get_events(...)", {}))
+    check_events({{ "test2", "detach", 1 }})
+   end
+
+  it('works', function()
+    check(false)
  end)
+
+  it('works with verify', function()
+    check(true)
+  end)
+
+  it('works with utf_sizes and ASCII text', function()
+    check(false,true)
+  end)
+
+  it('works with utf_sizes and unicode text', function()
+    local unicode_text = {"ascii text",
+                          "latin text åäö",
+                          "BMP text ɧ αλφά",
+                          "BMP text 汉语 ↥↧",
+                          "SMP 🤦 🦄🦃",
+                          "combining å بِيَّة"}
+    meths.buf_set_lines(0, 0, -1, true, unicode_text)
+    feed('gg')
+    exec_lua("return test_register(...)", 0, "test1", false, true)
+    local tick = meths.buf_get_changedtick(0)
+
+    feed('dd')
+    tick = tick + 1
+    eq({{ "test1", "lines", 1, tick, 0, 1, 0, 11, 11, 11 }}, exec_lua("return get_events(...)" ))
+
+    feed('A<bs>')
+    tick = tick + 1
+    eq({{ "test1", "lines", 1, tick, 0, 1, 1, 18, 15, 15 }}, exec_lua("return get_events(...)" ))
+
+    feed('<esc>jylp')
+    tick = tick + 1
+    eq({{ "test1", "lines", 1, tick, 1, 2, 2, 21, 16, 16 }}, exec_lua("return get_events(...)" ))
+
+    feed('+eea<cr>')
+    tick = tick + 1
+    eq({{ "test1", "lines", 1, tick, 2, 3, 4, 23, 15, 15 }}, exec_lua("return get_events(...)" ))
+
+    feed('<esc>jdw')
+    tick = tick + 1
+    -- non-BMP chars count as 2 UTF-16 code units
+    eq({{ "test1", "lines", 1, tick, 4, 5, 5, 18, 9, 12 }}, exec_lua("return get_events(...)" ))
+
+    feed('+rx')
+    tick = tick + 1
+    -- count the individual codepoints of a composed character.
+    eq({{ "test1", "lines", 1, tick, 5, 6, 6, 27, 20, 20 }}, exec_lua("return get_events(...)" ))
+
+    feed('kJ')
+    tick = tick + 1
+    -- NB: this is inefficient (but not really wrong).
+    eq({{ "test1", "lines", 1,   tick, 4, 5, 5, 14, 5, 8 },
+        { "test1", "lines", 1, tick+1, 5, 6, 5, 27, 20, 20 }}, exec_lua("return get_events(...)" ))
+  end)
+
 end)