@@ -14,6 +14,47 @@ import { getGitHubEnterpriseApiUrl } from './scale-up';
1414
// Logger shared by this module, created through createChildLogger with the name 'scale-down'.
const logger = createChildLogger('scale-down');
1616
/**
 * Retry policy used when calling the GitHub API during scale-down.
 *
 * `maxRetries` is consumed as the total number of attempts (the first call
 * included), so the value 3 means one initial call plus at most two retries.
 * Declared `as const` so the policy cannot be modified at runtime.
 */
const RETRY_CONFIG = {
  /** Upper bound on how many times an operation is attempted. */
  maxRetries: 3,
  /** Wait before the second attempt, in milliseconds; doubled for every later attempt. */
  initialDelayMs: 1000,
  /** Ceiling for the exponentially growing wait, in milliseconds. */
  maxDelayMs: 10000,
} as const;
22+
/**
 * Suspends the caller for roughly `ms` milliseconds.
 *
 * @param ms - Delay passed to `setTimeout`, in milliseconds.
 * @returns A promise that resolves with no value once the timer fires.
 */
async function sleep(ms: number): Promise<void> {
  await new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
26+
/**
 * Decides whether a failed GitHub API call is worth repeating.
 *
 * Only `RequestError` instances are inspected; any other thrown value is
 * treated as a permanent failure.
 *
 * @param error - The value caught from the failed call.
 * @returns `true` when the HTTP status is 500 or above, or exactly 429; `false` otherwise.
 */
function isRetryableError(error: unknown): boolean {
  if (!(error instanceof RequestError)) {
    return false;
  }
  // `instanceof` already narrows `error`, so `status` can be read without a cast.
  const { status } = error;
  // Retry on server errors (5xx) and rate limiting (429).
  // NOTE(review): GitHub may also signal rate limiting with a 403; those are not retried here - confirm this is intended.
  return status >= 500 || status === 429;
}
35+
/**
 * Runs `operation`, repeating it with exponential backoff for as long as it
 * fails with an error that `isRetryableError` accepts.
 *
 * The operation is always attempted at least once and at most
 * `RETRY_CONFIG.maxRetries` times in total. The wait before attempt `k + 1` is
 * `initialDelayMs * 2^(k - 1)`, capped at `maxDelayMs`. Each retry is announced
 * with a warning log.
 *
 * @param operation - Async action to execute; invoked anew on every attempt.
 * @param operationName - Human-readable name of the action, used only in the warning log.
 * @param context - Description of what the action targets, used only in the warning log.
 * @returns The value `operation` resolves with.
 * @throws The error of the last attempt, unchanged, when it is not retryable
 *   or the attempt budget is exhausted.
 */
async function withRetry<T>(operation: () => Promise<T>, operationName: string, context: string): Promise<T> {
  const { maxRetries, initialDelayMs, maxDelayMs } = RETRY_CONFIG;

  // The loop has no exit condition of its own: every iteration either returns
  // the result, rethrows, or sleeps and moves on to the next attempt.
  for (let attempt = 1; ; attempt++) {
    try {
      return await operation();
    } catch (error) {
      if (!isRetryableError(error) || attempt >= maxRetries) {
        throw error;
      }
      const delay = Math.min(initialDelayMs * 2 ** (attempt - 1), maxDelayMs);
      logger.warn(
        `${operationName} failed for ${context} (attempt ${attempt}/${maxRetries}), ` +
          `retrying in ${delay}ms. Error: ${error}`,
      );
      await sleep(delay);
    }
  }
}
57+
/** Runner entries as typed for the response of `GET /orgs/{org}/actions/runners`. */
type OrgRunnerList = Endpoints['GET /orgs/{org}/actions/runners']['response']['data']['runners'];
/** Runner entries as typed for the response of `GET /repos/{owner}/{repo}/actions/runners`. */
type RepoRunnerList = Endpoints['GET /repos/{owner}/{repo}/actions/runners']['response']['data']['runners'];
/** A single runner entry taken from either the organization or the repository listing. */
type RunnerState = OrgRunnerList[number] | RepoRunnerList[number];
@@ -127,6 +168,33 @@ function runnerMinimumTimeExceeded(runner: RunnerInfo): boolean {
127168 return launchTimePlusMinimum < now ;
128169}
129170
/**
 * De-registers a single self-hosted runner from GitHub, delegating retries of
 * the API call to `withRetry`.
 *
 * @param githubAppClient - Octokit client used to issue the delete request.
 * @param ec2runner - Runner record; `type` selects the organization or the
 *   repository endpoint, `owner` supplies the organization name or an
 *   `owner/repo` pair, and `instanceId` is used for log context.
 * @param ghRunnerId - GitHub id of the runner registration to delete.
 * @returns The HTTP status of the delete response.
 * @throws Whatever `withRetry` rethrows once the call fails for good.
 */
async function deleteGitHubRunner(
  githubAppClient: Octokit,
  ec2runner: RunnerInfo,
  ghRunnerId: number,
): Promise<number> {
  const deleteOperation = async (): Promise<number> => {
    if (ec2runner.type === 'Org') {
      const orgResponse = await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({
        runner_id: ghRunnerId,
        org: ec2runner.owner,
      });
      return orgResponse.status;
    }

    // For repository runners `owner` is split on '/' into owner and repo; do it once for both fields.
    const [owner, repo] = ec2runner.owner.split('/');
    const repoResponse = await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({
      runner_id: ghRunnerId,
      owner,
      repo,
    });
    return repoResponse.status;
  };

  return await withRetry(
    deleteOperation,
    'Delete GitHub runner',
    `runner ${ec2runner.instanceId} (GitHub ID: ${ghRunnerId})`,
  );
}
197+
130198async function removeRunner ( ec2runner : RunnerInfo , ghRunnerIds : number [ ] ) : Promise < void > {
131199 const githubAppClient = await getOrCreateOctokit ( ec2runner ) ;
132200 try {
@@ -146,28 +214,35 @@ async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promi
146214 ) ;
147215
148216 if ( states . every ( ( busy ) => busy === false ) ) {
149- const statuses = await Promise . all (
217+ const results = await Promise . all (
150218 ghRunnerIds . map ( async ( ghRunnerId ) => {
151- return (
152- ec2runner . type === 'Org'
153- ? await githubAppClient . actions . deleteSelfHostedRunnerFromOrg ( {
154- runner_id : ghRunnerId ,
155- org : ec2runner . owner ,
156- } )
157- : await githubAppClient . actions . deleteSelfHostedRunnerFromRepo ( {
158- runner_id : ghRunnerId ,
159- owner : ec2runner . owner . split ( '/' ) [ 0 ] ,
160- repo : ec2runner . owner . split ( '/' ) [ 1 ] ,
161- } )
162- ) . status ;
219+ try {
220+ const status = await deleteGitHubRunner ( githubAppClient , ec2runner , ghRunnerId ) ;
221+ return { ghRunnerId, status, success : status === 204 } ;
222+ } catch ( error ) {
223+ logger . error (
224+ `Failed to de-register GitHub runner ${ ghRunnerId } for instance '${ ec2runner . instanceId } ' after retries. Error: ${ error } ` ,
225+ { error : error as Error } ,
226+ ) ;
227+ return { ghRunnerId, status : 0 , success : false } ;
228+ }
163229 } ) ,
164230 ) ;
165231
166- if ( statuses . every ( ( status ) => status == 204 ) ) {
232+ const allSucceeded = results . every ( ( r ) => r . success ) ;
233+ const failedRunners = results . filter ( ( r ) => ! r . success ) ;
234+
235+ if ( allSucceeded ) {
167236 await terminateRunner ( ec2runner . instanceId ) ;
168237 logger . info ( `AWS runner instance '${ ec2runner . instanceId } ' is terminated and GitHub runner is de-registered.` ) ;
169238 } else {
170- logger . error ( `Failed to de-register GitHub runner: ${ statuses } ` ) ;
239+ // Only terminate EC2 if we successfully de-registered from GitHub
240+ // Otherwise, leave the instance running so the next scale-down cycle can retry
241+ logger . error (
242+ `Failed to de-register ${ failedRunners . length } GitHub runner(s) for instance '${ ec2runner . instanceId } '. ` +
243+ `Instance will NOT be terminated to allow retry on next scale-down cycle. ` +
244+ `Failed runner IDs: ${ failedRunners . map ( ( r ) => r . ghRunnerId ) . join ( ', ' ) } ` ,
245+ ) ;
171246 }
172247 } else {
173248 logger . info ( `Runner '${ ec2runner . instanceId } ' cannot be de-registered, because it is still busy.` ) ;
0 commit comments