Files
llm-in-text/completions-sample-code/prompt/src/snippetInclusion/similarFiles.ts
ydy0615 ba49f82953 Add tokenization and context provider API types
- Implemented window delineation tests for indentation-based tokenization.
- Created tokenizer module with various tokenization strategies including TTokenizer and ApproximateTokenizer.
- Added type definitions for authentication parameters and code citation notifications.
- Introduced context provider API for extensions to supply additional context items to Copilot.
- Defined core types and schemas for position and range.
- Established status types for agent status management in IDEs.
2026-01-18 10:24:32 +08:00

120 lines
3.8 KiB
TypeScript

/*---------------------------------------------------------------------------------------------
* Copyright (c) Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See License.txt in the project root for license information.
*--------------------------------------------------------------------------------------------*/
import { DocumentInfoWithOffset, SimilarFileInfo } from '../prompt';
import { FixedWindowSizeJaccardMatcher } from './jaccardMatching';
import { SnippetWithProviderInfo } from './snippets';
import { BlockTokenSubsetMatcher } from './subsetMatching';
const DEFAULT_SNIPPET_THRESHOLD = 0.0;
const DEFAULT_SNIPPET_WINDOW_SIZE = 60;
const DEFAULT_MAX_TOP_SNIPPETS = 4;
const DEFAULT_MAX_SNIPPETS_PER_FILE = 1;
const DEFAULT_MAX_NUMBER_OF_FILES = 20;
const DEFAULT_MAX_CHARACTERS_PER_FILE = 10000;
export interface SimilarFilesOptions {
snippetLength: number;
threshold: number;
maxTopSnippets: number;
maxCharPerFile: number;
maxNumberOfFiles: number;
maxSnippetsPerFile: number;
useSubsetMatching?: boolean;
}
export const defaultSimilarFilesOptions: SimilarFilesOptions = {
snippetLength: DEFAULT_SNIPPET_WINDOW_SIZE,
threshold: DEFAULT_SNIPPET_THRESHOLD,
maxTopSnippets: DEFAULT_MAX_TOP_SNIPPETS,
maxCharPerFile: DEFAULT_MAX_CHARACTERS_PER_FILE,
maxNumberOfFiles: DEFAULT_MAX_NUMBER_OF_FILES,
maxSnippetsPerFile: DEFAULT_MAX_SNIPPETS_PER_FILE,
useSubsetMatching: false,
};
export const conservativeFilesOptions: SimilarFilesOptions = {
snippetLength: 10,
threshold: 0.3,
maxTopSnippets: 1,
maxCharPerFile: DEFAULT_MAX_CHARACTERS_PER_FILE,
maxNumberOfFiles: DEFAULT_MAX_NUMBER_OF_FILES,
maxSnippetsPerFile: 1,
};
export const nullSimilarFilesOptions: SimilarFilesOptions = {
snippetLength: 0,
threshold: 1,
maxTopSnippets: 0,
maxCharPerFile: 0,
maxNumberOfFiles: 0,
maxSnippetsPerFile: 0,
};
// Default similarity parameters for languageId === 'cpp'.
export const defaultCppSimilarFilesOptions: SimilarFilesOptions = {
snippetLength: 60,
threshold: 0.0,
maxTopSnippets: 16,
maxCharPerFile: 100000,
maxNumberOfFiles: 200,
maxSnippetsPerFile: 4,
};
function getMatcher(doc: DocumentInfoWithOffset, selection: SimilarFilesOptions) {
const matcherFactory = selection.useSubsetMatching
? BlockTokenSubsetMatcher.FACTORY(selection.snippetLength)
: FixedWindowSizeJaccardMatcher.FACTORY(selection.snippetLength);
return matcherFactory.to(doc);
}
/**
* @returns A SnippetWithProviderInfo describing the best matches from similar files.
*/
export async function getSimilarSnippets(
doc: DocumentInfoWithOffset,
similarFiles: SimilarFileInfo[],
options: SimilarFilesOptions
): Promise<SnippetWithProviderInfo[]> {
const matcher = getMatcher(doc, options);
if (options.maxTopSnippets === 0) {
return [];
}
const snippets = (
await similarFiles
// filter out absurdly long or absurdly many open files
.filter(similarFile => similarFile.source.length < options.maxCharPerFile && similarFile.source.length > 0)
// slice(0) duplicates an array
.slice(0, options.maxNumberOfFiles)
.reduce(
async (
acc,
similarFile // accumulator of all snippets from all similarFiles
) =>
(await acc).concat(
(await matcher.findMatches(similarFile, options.maxSnippetsPerFile)).map(snippet => ({
relativePath: similarFile.relativePath,
...snippet,
}))
),
Promise.resolve([] as SnippetWithProviderInfo[])
)
)
.filter(
similarFile =>
// remove files that had no match at all
similarFile.score &&
similarFile.snippet &&
// remove files that had a low score
similarFile.score > options.threshold
)
// order them with best (highest scores) last
.sort((a, b) => a.score - b.score)
// take the best options from the end
.slice(-options.maxTopSnippets);
return snippets;
}