Skip to content

Commit 754c971

Browse files
committed
perf: improve file and context matching performance
Introduces similarity thresholds and file limits to improve matching performance:

- Add MIN_SYMBOL_SIMILARITY (0.4) and MIN_SEMANTIC_SIMILARITY (0.3) thresholds
- Implement a max_files limit for directory scanning (1000 in config/contexts.lua, MAX_FILES = 2500 in context.lua)
- Refactor ranking algorithms to filter by similarity instead of taking top N
- Improve symbol matching to use normalized similarity scores

These changes help prevent performance issues with large codebases while maintaining matching quality by using similarity-based filtering.

Signed-off-by: Tomas Slusny <slusnucky@gmail.com>
1 parent e1aab06 commit 754c971

3 files changed

Lines changed: 55 additions & 41 deletions

File tree

lua/CopilotChat/config/contexts.lua

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ return {
5858
local files = utils.scan_dir(cwd, {
5959
add_dirs = false,
6060
respect_gitignore = true,
61+
max_files = 1000,
6162
})
6263

6364
async.util.scheduler()
@@ -76,7 +77,7 @@ return {
7677
input = function(callback)
7778
local choices = utils.kv_list({
7879
list = 'Only lists file names',
79-
full = 'Includes file content for each file found. Can be slow on large workspaces, use with care.',
80+
full = 'Includes file content for each file found, up to a limit.',
8081
})
8182

8283
vim.ui.select(choices, {

lua/CopilotChat/context.lua

Lines changed: 47 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -66,17 +66,19 @@ local OFF_SIDE_RULE_LANGUAGES = {
6666
'fsharp',
6767
}
6868

69-
local TOP_SYMBOLS = 100
70-
local TOP_RELATED = 25
69+
local MIN_SYMBOL_SIMILARITY = 0.4 -- Minimum score an entry needs to survive data_ranked_by_symbols. NOTE(review): intended to be the more lenient filter, yet numerically higher than MIN_SEMANTIC_SIMILARITY; the two scores are computed on different scales -- confirm
70+
local MIN_SEMANTIC_SIMILARITY = 0.3 -- Minimum cosine similarity an entry needs to survive data_ranked_by_relatedness. NOTE(review): intended to be the stricter filter, yet numerically lower than MIN_SYMBOL_SIMILARITY; the two scores are computed on different scales -- confirm
7171
local MULTI_FILE_THRESHOLD = 5
72+
local MAX_FILES = 2500
7273

7374
--- Compute the cosine similarity between two vectors
7475
---@param a table<number>
7576
---@param b table<number>
77+
---@param def number? Fallback returned when either vector is missing (0 if omitted)
7678
---@return number
77-
local function spatial_distance_cosine(a, b)
79+
local function spatial_distance_cosine(a, b, def)
7880
if not a or not b then
79-
return 0
81+
return def or 0
8082
end
8183

8284
local dot_product = 0
@@ -95,78 +97,83 @@ end
9597
--- Rank data by relatedness to the query
9698
---@param query CopilotChat.context.embed
9799
---@param data table<CopilotChat.context.embed>
98-
---@param top_n number
100+
---@param min_similarity number
99101
---@return table<CopilotChat.context.embed>
100-
local function data_ranked_by_relatedness(query, data, top_n)
101-
data = vim.tbl_map(function(item)
102-
return vim.tbl_extend(
103-
'force',
104-
item,
105-
{ score = item.score or spatial_distance_cosine(item.embedding, query.embedding) }
106-
)
107-
end, data)
102+
local function data_ranked_by_relatedness(query, data, min_similarity)
103+
local results = {}
104+
for _, item in ipairs(data) do
105+
local similarity = spatial_distance_cosine(item.embedding, query.embedding, item.score)
106+
if similarity >= min_similarity then
107+
table.insert(results, vim.tbl_extend('force', item, { score = similarity }))
108+
end
109+
end
108110

109-
table.sort(data, function(a, b)
111+
table.sort(results, function(a, b)
110112
return a.score > b.score
111113
end)
112114

113-
return vim.list_slice(data, 1, top_n)
115+
return results
114116
end
115117

116118
--- Rank data by symbols
117-
---@param query string
119+
---@param prompt string
118120
---@param data table<CopilotChat.context.embed>
119-
---@param top_n number
120-
local function data_ranked_by_symbols(query, data, top_n)
121+
---@param min_similarity number
122+
---@return table<CopilotChat.context.embed>
123+
local function data_ranked_by_symbols(prompt, data, min_similarity)
121124
local query_terms = {}
122-
for term in query:lower():gmatch('%w+') do
125+
for term in prompt:lower():gmatch('%w+') do
123126
query_terms[term] = true
124127
end
125128

126129
local results = {}
127130
for _, entry in ipairs(data) do
128-
local score = 0
131+
local total_terms = 0
132+
local matched_terms = 0
129133
local filename = entry.filename and entry.filename:lower() or ''
130134

131-
-- Filename matches (highest priority)
135+
-- Calculate similarity score based on term matches
132136
for term in pairs(query_terms) do
137+
total_terms = total_terms + 1
138+
139+
-- Filename matches
133140
if filename:find(term, 1, true) then
134-
score = score + 15
141+
matched_terms = matched_terms + 1
135142
if vim.fn.fnamemodify(filename, ':t'):gsub('%..*$', '') == term then
136-
score = score + 10
143+
matched_terms = matched_terms + 0.5 -- Bonus for exact filename match
137144
end
138145
end
139-
end
140146

141-
-- Symbol matches
142-
if entry.symbols then
143-
for _, symbol in ipairs(entry.symbols) do
144-
for term in pairs(query_terms) do
145-
-- Check symbol name (high priority)
147+
-- Symbol matches
148+
if entry.symbols then
149+
for _, symbol in ipairs(entry.symbols) do
146150
if symbol.name and symbol.name:lower():find(term, 1, true) then
147-
score = score + 5
151+
matched_terms = matched_terms + 1
148152
if symbol.name:lower() == term then
149-
score = score + 3
153+
matched_terms = matched_terms + 0.5 -- Bonus for exact symbol match
150154
end
151155
end
152-
153-
-- Check signature (medium priority)
154-
-- This catches parameter names, return types, etc
155156
if symbol.signature and symbol.signature:lower():find(term, 1, true) then
156-
score = score + 2
157+
matched_terms = matched_terms + 0.5 -- Partial credit for signature matches
157158
end
158159
end
159160
end
160161
end
161162

162-
table.insert(results, vim.tbl_extend('force', entry, { score = score }))
163+
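-- Calculate similarity score (nominally 0 to 1; it can exceed 1 when several symbols match the same term, and is 0/0 when the prompt yields no terms)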
164+
local similarity = matched_terms / (total_terms * 2) -- Denominator accounts for potential bonuses
165+
166+
-- Only include results above similarity threshold
167+
if similarity >= min_similarity then
168+
table.insert(results, vim.tbl_extend('force', entry, { score = similarity }))
169+
end
163170
end
164171

165172
table.sort(results, function(a, b)
166173
return a.score > b.score
167174
end)
168175

169-
return vim.list_slice(results, 1, top_n)
176+
return results
170177
end
171178

172179
--- Get the full signature of a declaration
@@ -326,6 +333,7 @@ function M.files(winnr, with_content)
326333
local files = utils.scan_dir(cwd, {
327334
add_dirs = false,
328335
respect_gitignore = true,
336+
max_files = MAX_FILES,
329337
})
330338

331339
notify.publish(notify.STATUS, 'Reading files')
@@ -596,7 +604,7 @@ function M.filter_embeddings(prompt, model, embeddings)
596604
end
597605

598606
-- Rank embeddings by symbols
599-
embeddings = data_ranked_by_symbols(prompt, embeddings, TOP_SYMBOLS)
607+
embeddings = data_ranked_by_symbols(prompt, embeddings, MIN_SYMBOL_SIMILARITY)
600608
log.debug('Ranked data:', #embeddings)
601609
for i, item in ipairs(embeddings) do
602610
log.debug(string.format('%s: %s - %s', i, item.score, item.filename))
@@ -615,7 +623,7 @@ function M.filter_embeddings(prompt, model, embeddings)
615623
-- Rate embeddings by relatedness to the query
616624
local embedded_query = table.remove(embeddings, #embeddings)
617625
log.debug('Embedded query:', embedded_query.content)
618-
embeddings = data_ranked_by_relatedness(embedded_query, embeddings, TOP_RELATED)
626+
embeddings = data_ranked_by_relatedness(embedded_query, embeddings, MIN_SEMANTIC_SIMILARITY)
619627
log.debug('Ranked embeddings:', #embeddings)
620628
for i, item in ipairs(embeddings) do
621629
log.debug(string.format('%s: %s - %s', i, item.score, item.filename))

lua/CopilotChat/utils.lua

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -363,7 +363,12 @@ M.scan_dir = async.wrap(function(path, opts, callback)
363363
scandir.scan_dir_async(
364364
path,
365365
vim.tbl_deep_extend('force', opts, {
366-
on_exit = callback,
366+
on_exit = function(files)
367+
if opts.max_files then
368+
files = vim.list_slice(files, 1, opts.max_files)
369+
end
370+
callback(files)
371+
end,
367372
})
368373
)
369374
end, 3)

0 commit comments

Comments
 (0)