[eventLog] retry resource creation at initialization time (#136363)

resolves https://github.com/elastic/kibana/issues/134098

Adds retry logic to the initialization of elasticsearch
resources, when Kibana starts up.  Recently, it seems
this has become a more noticeable error - that race
conditions occur where two Kibana's initializing a new
stack version will race to create the event log resources.

We believe we'll see the end of these issues with some
retries, chunked around the 4 resource-y sections of
the initialization code.

We're using [p-retry][] (which uses [retry][]), to do an
exponential backoff starting at 2s, then 4s, 8s, 16s,
with 4 retries (so 5 actual attempted calls).  Some
randomness is added, since there's a race on.

[p-retry]: https://github.com/sindresorhus/p-retry#p-retry
[retry]: https://github.com/tim-kos/node-retry#retry

Co-authored-by: Kibana Machine <42973632+kibanamachine@users.noreply.github.com>
This commit is contained in:
Patrick Mueller 2022-07-19 10:26:13 -04:00 committed by GitHub
parent 407c105fa2
commit f6e4c2f806
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 131 additions and 9 deletions

View file

@ -12,6 +12,8 @@ import { namesMock } from './names.mock';
import { IClusterClientAdapter } from './cluster_client_adapter';
import { clusterClientAdapterMock } from './cluster_client_adapter.mock';
export const MOCK_RETRY_DELAY = 20;
const createContextMock = () => {
const mock: jest.Mocked<EsContext> & {
esAdapter: jest.Mocked<IClusterClientAdapter>;
@ -23,6 +25,7 @@ const createContextMock = () => {
waitTillReady: jest.fn(async () => true),
esAdapter: clusterClientAdapterMock.create(),
initialized: true,
retryDelay: MOCK_RETRY_DELAY,
};
return mock;
};

View file

@ -12,14 +12,17 @@ import { initializeEs } from './init';
import { ClusterClientAdapter, IClusterClientAdapter } from './cluster_client_adapter';
import { createReadySignal, ReadySignal } from '../lib/ready_signal';
export const RETRY_DELAY = 2000;
export interface EsContext {
logger: Logger;
esNames: EsNames;
esAdapter: IClusterClientAdapter;
readonly logger: Logger;
readonly esNames: EsNames;
readonly esAdapter: IClusterClientAdapter;
initialize(): void;
shutdown(): Promise<void>;
waitTillReady(): Promise<boolean>;
initialized: boolean;
readonly initialized: boolean;
readonly retryDelay: number;
}
export interface EsError {
@ -44,12 +47,14 @@ class EsContextImpl implements EsContext {
public esAdapter: IClusterClientAdapter;
private readonly readySignal: ReadySignal<boolean>;
public initialized: boolean;
public readonly retryDelay: number;
constructor(params: EsContextCtorParams) {
this.logger = params.logger;
this.esNames = getEsNames(params.indexNameRoot, params.kibanaVersion);
this.readySignal = createReadySignal();
this.initialized = false;
this.retryDelay = RETRY_DELAY;
this.esAdapter = new ClusterClientAdapter({
logger: params.logger,
elasticsearchClientPromise: params.elasticsearchClientPromise,

View file

@ -5,7 +5,7 @@
* 2.0.
*/
import { contextMock } from './context.mock';
import { contextMock, MOCK_RETRY_DELAY } from './context.mock';
import { initializeEs, parseIndexAliases } from './init';
describe('initializeEs', () => {
@ -454,3 +454,91 @@ describe('parseIndexAliases', () => {
]);
});
});
describe('retries', () => {
let esContext = contextMock.create();
// set up context APIs to return defaults indicating already created
beforeEach(() => {
esContext = contextMock.create();
esContext.esAdapter.getExistingLegacyIndexTemplates.mockResolvedValue({});
esContext.esAdapter.getExistingIndices.mockResolvedValue({});
esContext.esAdapter.getExistingIndexAliases.mockResolvedValue({});
esContext.esAdapter.doesIlmPolicyExist.mockResolvedValue(true);
esContext.esAdapter.doesIndexTemplateExist.mockResolvedValue(true);
esContext.esAdapter.doesAliasExist.mockResolvedValue(true);
});
test('createIlmPolicyIfNotExists with 1 retry', async () => {
esContext.esAdapter.doesIlmPolicyExist.mockRejectedValueOnce(new Error('retry 1'));
const timeStart = Date.now();
await initializeEs(esContext);
const timeElapsed = Date.now() - timeStart;
expect(timeElapsed).toBeGreaterThanOrEqual(MOCK_RETRY_DELAY);
expect(esContext.esAdapter.getExistingLegacyIndexTemplates).toHaveBeenCalledTimes(1);
expect(esContext.esAdapter.doesIlmPolicyExist).toHaveBeenCalledTimes(2);
expect(esContext.esAdapter.doesIndexTemplateExist).toHaveBeenCalledTimes(1);
expect(esContext.esAdapter.doesAliasExist).toHaveBeenCalledTimes(1);
const prefix = `eventLog initialization operation failed and will be retried: createIlmPolicyIfNotExists`;
expect(esContext.logger.warn).toHaveBeenCalledTimes(1);
expect(esContext.logger.warn).toHaveBeenCalledWith(`${prefix}; 4 more times; error: retry 1`);
});
test('createIndexTemplateIfNotExists with 2 retries', async () => {
esContext.esAdapter.doesIndexTemplateExist.mockRejectedValueOnce(new Error('retry 2a'));
esContext.esAdapter.doesIndexTemplateExist.mockRejectedValueOnce(new Error('retry 2b'));
const timeStart = Date.now();
await initializeEs(esContext);
const timeElapsed = Date.now() - timeStart;
expect(timeElapsed).toBeGreaterThanOrEqual(MOCK_RETRY_DELAY * (1 + 2));
expect(esContext.esAdapter.getExistingLegacyIndexTemplates).toHaveBeenCalledTimes(1);
expect(esContext.esAdapter.doesIlmPolicyExist).toHaveBeenCalledTimes(1);
expect(esContext.esAdapter.doesIndexTemplateExist).toHaveBeenCalledTimes(3);
expect(esContext.esAdapter.doesAliasExist).toHaveBeenCalledTimes(1);
const prefix = `eventLog initialization operation failed and will be retried: createIndexTemplateIfNotExists`;
expect(esContext.logger.warn).toHaveBeenCalledTimes(2);
expect(esContext.logger.warn).toHaveBeenCalledWith(`${prefix}; 4 more times; error: retry 2a`);
expect(esContext.logger.warn).toHaveBeenCalledWith(`${prefix}; 3 more times; error: retry 2b`);
});
test('createInitialIndexIfNotExists', async () => {
esContext.esAdapter.doesAliasExist.mockRejectedValueOnce(new Error('retry 5a'));
esContext.esAdapter.doesAliasExist.mockRejectedValueOnce(new Error('retry 5b'));
esContext.esAdapter.doesAliasExist.mockRejectedValueOnce(new Error('retry 5c'));
esContext.esAdapter.doesAliasExist.mockRejectedValueOnce(new Error('retry 5d'));
esContext.esAdapter.doesAliasExist.mockRejectedValueOnce(new Error('retry 5e'));
// make sure it only tries 5 times - this one should not be reported
esContext.esAdapter.doesAliasExist.mockRejectedValueOnce(new Error('retry 5f'));
const timeStart = Date.now();
await initializeEs(esContext);
const timeElapsed = Date.now() - timeStart;
expect(timeElapsed).toBeGreaterThanOrEqual(MOCK_RETRY_DELAY * (1 + 2 + 4 + 8));
expect(esContext.esAdapter.getExistingLegacyIndexTemplates).toHaveBeenCalledTimes(1);
expect(esContext.esAdapter.doesIlmPolicyExist).toHaveBeenCalledTimes(1);
expect(esContext.esAdapter.doesIndexTemplateExist).toHaveBeenCalledTimes(1);
expect(esContext.esAdapter.doesAliasExist).toHaveBeenCalledTimes(5);
const prefix = `eventLog initialization operation failed and will be retried: createInitialIndexIfNotExists`;
expect(esContext.logger.warn).toHaveBeenCalledTimes(5);
expect(esContext.logger.warn).toHaveBeenCalledWith(`${prefix}; 4 more times; error: retry 5a`);
expect(esContext.logger.warn).toHaveBeenCalledWith(`${prefix}; 3 more times; error: retry 5b`);
expect(esContext.logger.warn).toHaveBeenCalledWith(`${prefix}; 2 more times; error: retry 5c`);
expect(esContext.logger.warn).toHaveBeenCalledWith(`${prefix}; 1 more times; error: retry 5d`);
expect(esContext.logger.warn).toHaveBeenCalledWith(`${prefix}; 0 more times; error: retry 5e`);
expect(esContext.logger.error).toHaveBeenCalledTimes(1);
expect(esContext.logger.error).toHaveBeenCalledWith(
`error initializing elasticsearch resources: retry 5e`
);
});
});

View file

@ -8,9 +8,12 @@
import type * as estypes from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
import { asyncForEach } from '@kbn/std';
import { groupBy } from 'lodash';
import pRetry, { FailedAttemptError } from 'p-retry';
import { getIlmPolicy, getIndexTemplate } from './documents';
import { EsContext } from './context';
const MAX_RETRY_DELAY = 30000;
export async function initializeEs(esContext: EsContext): Promise<boolean> {
esContext.logger.debug('initializing elasticsearch resources starting');
@ -28,10 +31,33 @@ export async function initializeEs(esContext: EsContext): Promise<boolean> {
async function initializeEsResources(esContext: EsContext) {
const steps = new EsInitializationSteps(esContext);
await steps.setExistingAssetsToHidden();
await steps.createIlmPolicyIfNotExists();
await steps.createIndexTemplateIfNotExists();
await steps.createInitialIndexIfNotExists();
// today, setExistingAssetsToHidden() never throws, but just in case ...
await retry(steps.setExistingAssetsToHidden);
await retry(steps.createIlmPolicyIfNotExists);
await retry(steps.createIndexTemplateIfNotExists);
await retry(steps.createInitialIndexIfNotExists);
async function retry(stepMethod: () => Promise<void>): Promise<void> {
// call the step method with retry options via p-retry
await pRetry(() => stepMethod.call(steps), getRetryOptions(esContext, stepMethod.name));
}
}
function getRetryOptions(esContext: EsContext, operation: string) {
const logger = esContext.logger;
// should retry on the order of 2s, 4s, 8s, 16s
// see: https://github.com/tim-kos/node-retry#retryoperationoptions
return {
minTimeout: esContext.retryDelay,
maxTimeout: MAX_RETRY_DELAY,
retries: 4,
factor: 2,
randomize: true,
onFailedAttempt: (err: FailedAttemptError) => {
const message = `eventLog initialization operation failed and will be retried: ${operation}; ${err.retriesLeft} more times; error: ${err.message}`;
logger.warn(message);
},
};
}
export interface ParsedIndexAlias extends estypes.IndicesAliasDefinition {