@@ -66,17 +66,19 @@ local OFF_SIDE_RULE_LANGUAGES = {
6666 ' fsharp' ,
6767}
6868
69- local TOP_SYMBOLS = 100
70- local TOP_RELATED = 25
69+ local MIN_SYMBOL_SIMILARITY = 0.4 -- Minimum score from data_ranked_by_symbols for an entry to be kept
70+ local MIN_SEMANTIC_SIMILARITY = 0.3 -- Minimum cosine similarity to the query for an embedding to be kept
7171local MULTI_FILE_THRESHOLD = 5
72+ local MAX_FILES = 2500
7273
7374--- Compute the cosine similarity between two vectors
7475--- @param a table<number>
7576--- @param b table<number>
77+ --- @param def number? Value returned when either vector is missing (defaults to 0)
7678--- @return number
77- local function spatial_distance_cosine (a , b )
79+ local function spatial_distance_cosine (a , b , def )
7880 if not a or not b then
79- return 0
81+ return def or 0
8082 end
8183
8284 local dot_product = 0
--- Rank data by relatedness to the query
--- Scores every item with spatial_distance_cosine between its embedding and the
--- query embedding, drops items scoring below `min_similarity`, and returns the
--- remaining items sorted by descending score. Each returned item is the table
--- produced by vim.tbl_extend from the input item plus the computed `score`.
--- @param query CopilotChat.context.embed
--- @param data table<CopilotChat.context.embed>
--- @param min_similarity number? Minimum score an item needs to be kept; nil keeps every item
--- @return table<CopilotChat.context.embed>
local function data_ranked_by_relatedness(query, data, min_similarity)
  -- A missing threshold means "do not filter" instead of a nil-comparison error.
  local threshold = min_similarity or -math.huge

  local results = {}
  for _, item in ipairs(data or {}) do
    -- item.score is handed over as the fallback that spatial_distance_cosine
    -- returns when either embedding is missing.
    local similarity = spatial_distance_cosine(item.embedding, query.embedding, item.score)
    if similarity >= threshold then
      table.insert(results, vim.tbl_extend('force', item, { score = similarity }))
    end
  end

  -- Highest score first
  table.sort(results, function(a, b)
    return a.score > b.score
  end)

  return results
end
115117
--- Rank data by symbols
--- Splits the prompt into lowercase alphanumeric terms and scores every entry by
--- how often those terms occur (as plain substrings) in its filename, its symbol
--- names and its symbol signatures. Entries scoring below `min_similarity` are
--- dropped; the rest are returned sorted by descending score.
--- @param prompt string
--- @param data table<CopilotChat.context.embed>
--- @param min_similarity number? Minimum score an entry needs to be kept; nil keeps every entry
--- @return table<CopilotChat.context.embed>
local function data_ranked_by_symbols(prompt, data, min_similarity)
  -- A missing threshold means "do not filter" instead of a nil-comparison error.
  local threshold = min_similarity or -math.huge

  -- Collect the distinct query terms once; their count is the same for every entry.
  local query_terms = {}
  local total_terms = 0
  for term in prompt:lower():gmatch('%w+') do
    if not query_terms[term] then
      query_terms[term] = true
      total_terms = total_terms + 1
    end
  end

  local results = {}
  for _, entry in ipairs(data or {}) do
    local matched_terms = 0
    local filename = entry.filename and entry.filename:lower() or ''
    local stem -- file name without directory and extension(s), computed on first use

    for term in pairs(query_terms) do
      -- Filename matches
      if filename:find(term, 1, true) then
        matched_terms = matched_terms + 1
        if stem == nil then
          -- Parentheses keep only gsub's first return value (the string).
          stem = (vim.fn.fnamemodify(filename, ':t'):gsub('%..*$', ''))
        end
        if stem == term then
          matched_terms = matched_terms + 0.5 -- Bonus for exact filename match
        end
      end

      -- Symbol matches
      if entry.symbols then
        for _, symbol in ipairs(entry.symbols) do
          local name = symbol.name and symbol.name:lower()
          if name and name:find(term, 1, true) then
            matched_terms = matched_terms + 1
            if name == term then
              matched_terms = matched_terms + 0.5 -- Bonus for exact symbol match
            end
          end

          local signature = symbol.signature and symbol.signature:lower()
          if signature and signature:find(term, 1, true) then
            matched_terms = matched_terms + 0.5 -- Partial credit for signature matches
          end
        end
      end
    end

    -- Score = accumulated credit / (2 * number of query terms).
    -- The value is not clamped: credit is added for every matching symbol, so an
    -- entry with many matching symbols can score above 1.
    -- A prompt without any terms scores 0 rather than 0 / 0 (NaN), which would
    -- fail every comparison and silently drop all entries.
    local similarity = 0
    if total_terms > 0 then
      similarity = matched_terms / (total_terms * 2)
    end

    -- Only include results at or above the similarity threshold
    if similarity >= threshold then
      table.insert(results, vim.tbl_extend('force', entry, { score = similarity }))
    end
  end

  -- Highest score first
  table.sort(results, function(a, b)
    return a.score > b.score
  end)

  return results
end
171178
172179--- Get the full signature of a declaration
@@ -326,6 +333,7 @@ function M.files(winnr, with_content)
326333 local files = utils .scan_dir (cwd , {
327334 add_dirs = false ,
328335 respect_gitignore = true ,
336+ max_files = MAX_FILES ,
329337 })
330338
331339 notify .publish (notify .STATUS , ' Reading files' )
@@ -596,7 +604,7 @@ function M.filter_embeddings(prompt, model, embeddings)
596604 end
597605
598606 -- Rank embeddings by symbols
599- embeddings = data_ranked_by_symbols (prompt , embeddings , TOP_SYMBOLS )
607+ embeddings = data_ranked_by_symbols (prompt , embeddings , MIN_SYMBOL_SIMILARITY )
600608 log .debug (' Ranked data:' , # embeddings )
601609 for i , item in ipairs (embeddings ) do
602610 log .debug (string.format (' %s: %s - %s' , i , item .score , item .filename ))
@@ -615,7 +623,7 @@ function M.filter_embeddings(prompt, model, embeddings)
615623 -- Rate embeddings by relatedness to the query
616624 local embedded_query = table.remove (embeddings , # embeddings )
617625 log .debug (' Embedded query:' , embedded_query .content )
618- embeddings = data_ranked_by_relatedness (embedded_query , embeddings , TOP_RELATED )
626+ embeddings = data_ranked_by_relatedness (embedded_query , embeddings , MIN_SEMANTIC_SIMILARITY )
619627 log .debug (' Ranked embeddings:' , # embeddings )
620628 for i , item in ipairs (embeddings ) do
621629 log .debug (string.format (' %s: %s - %s' , i , item .score , item .filename ))
0 commit comments