mirror of
https://github.com/elastic/kibana.git
synced 2025-04-24 09:48:58 -04:00
resolves https://github.com/elastic/kibana/issues/134098
Adds retry logic to the initialization of elasticsearch
resources, when Kibana starts up. Recently, it seems
this has become a more noticeable error - that race
conditions occur where two Kibana's initializing a new
stack version will race to create the event log resources.
We believe we'll see the end of these issues with some
retries, chunked around the 4 resource-y sections of
the initialization code.
We're using [p-retry][] (which uses [retry][]), to do an
exponential backoff starting at 2s, then 4s, 8s, 16s,
with 4 retries (so 5 actual attempted calls). Some
randomness is added, since there's a race on.
[p-retry]: https://github.com/sindresorhus/p-retry#p-retry
[retry]: https://github.com/tim-kos/node-retry#retry
Co-authored-by: Kibana Machine <42973632+kibanamachine@users.noreply.github.com>
(cherry picked from commit f6e4c2f806
)
Co-authored-by: Patrick Mueller <patrick.mueller@elastic.co>
This commit is contained in:
parent
6e0619fcf7
commit
5f681db9ea
4 changed files with 131 additions and 9 deletions
|
@ -12,6 +12,8 @@ import { namesMock } from './names.mock';
|
|||
import { IClusterClientAdapter } from './cluster_client_adapter';
|
||||
import { clusterClientAdapterMock } from './cluster_client_adapter.mock';
|
||||
|
||||
export const MOCK_RETRY_DELAY = 20;
|
||||
|
||||
const createContextMock = () => {
|
||||
const mock: jest.Mocked<EsContext> & {
|
||||
esAdapter: jest.Mocked<IClusterClientAdapter>;
|
||||
|
@ -23,6 +25,7 @@ const createContextMock = () => {
|
|||
waitTillReady: jest.fn(async () => true),
|
||||
esAdapter: clusterClientAdapterMock.create(),
|
||||
initialized: true,
|
||||
retryDelay: MOCK_RETRY_DELAY,
|
||||
};
|
||||
return mock;
|
||||
};
|
||||
|
|
|
@ -12,14 +12,17 @@ import { initializeEs } from './init';
|
|||
import { ClusterClientAdapter, IClusterClientAdapter } from './cluster_client_adapter';
|
||||
import { createReadySignal, ReadySignal } from '../lib/ready_signal';
|
||||
|
||||
export const RETRY_DELAY = 2000;
|
||||
|
||||
export interface EsContext {
|
||||
logger: Logger;
|
||||
esNames: EsNames;
|
||||
esAdapter: IClusterClientAdapter;
|
||||
readonly logger: Logger;
|
||||
readonly esNames: EsNames;
|
||||
readonly esAdapter: IClusterClientAdapter;
|
||||
initialize(): void;
|
||||
shutdown(): Promise<void>;
|
||||
waitTillReady(): Promise<boolean>;
|
||||
initialized: boolean;
|
||||
readonly initialized: boolean;
|
||||
readonly retryDelay: number;
|
||||
}
|
||||
|
||||
export interface EsError {
|
||||
|
@ -44,12 +47,14 @@ class EsContextImpl implements EsContext {
|
|||
public esAdapter: IClusterClientAdapter;
|
||||
private readonly readySignal: ReadySignal<boolean>;
|
||||
public initialized: boolean;
|
||||
public readonly retryDelay: number;
|
||||
|
||||
constructor(params: EsContextCtorParams) {
|
||||
this.logger = params.logger;
|
||||
this.esNames = getEsNames(params.indexNameRoot, params.kibanaVersion);
|
||||
this.readySignal = createReadySignal();
|
||||
this.initialized = false;
|
||||
this.retryDelay = RETRY_DELAY;
|
||||
this.esAdapter = new ClusterClientAdapter({
|
||||
logger: params.logger,
|
||||
elasticsearchClientPromise: params.elasticsearchClientPromise,
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
* 2.0.
|
||||
*/
|
||||
|
||||
import { contextMock } from './context.mock';
|
||||
import { contextMock, MOCK_RETRY_DELAY } from './context.mock';
|
||||
import { initializeEs, parseIndexAliases } from './init';
|
||||
|
||||
describe('initializeEs', () => {
|
||||
|
@ -454,3 +454,91 @@ describe('parseIndexAliases', () => {
|
|||
]);
|
||||
});
|
||||
});
|
||||
|
||||
describe('retries', () => {
|
||||
let esContext = contextMock.create();
|
||||
// set up context APIs to return defaults indicating already created
|
||||
beforeEach(() => {
|
||||
esContext = contextMock.create();
|
||||
esContext.esAdapter.getExistingLegacyIndexTemplates.mockResolvedValue({});
|
||||
esContext.esAdapter.getExistingIndices.mockResolvedValue({});
|
||||
esContext.esAdapter.getExistingIndexAliases.mockResolvedValue({});
|
||||
esContext.esAdapter.doesIlmPolicyExist.mockResolvedValue(true);
|
||||
esContext.esAdapter.doesIndexTemplateExist.mockResolvedValue(true);
|
||||
esContext.esAdapter.doesAliasExist.mockResolvedValue(true);
|
||||
});
|
||||
|
||||
test('createIlmPolicyIfNotExists with 1 retry', async () => {
|
||||
esContext.esAdapter.doesIlmPolicyExist.mockRejectedValueOnce(new Error('retry 1'));
|
||||
|
||||
const timeStart = Date.now();
|
||||
await initializeEs(esContext);
|
||||
const timeElapsed = Date.now() - timeStart;
|
||||
|
||||
expect(timeElapsed).toBeGreaterThanOrEqual(MOCK_RETRY_DELAY);
|
||||
|
||||
expect(esContext.esAdapter.getExistingLegacyIndexTemplates).toHaveBeenCalledTimes(1);
|
||||
expect(esContext.esAdapter.doesIlmPolicyExist).toHaveBeenCalledTimes(2);
|
||||
expect(esContext.esAdapter.doesIndexTemplateExist).toHaveBeenCalledTimes(1);
|
||||
expect(esContext.esAdapter.doesAliasExist).toHaveBeenCalledTimes(1);
|
||||
|
||||
const prefix = `eventLog initialization operation failed and will be retried: createIlmPolicyIfNotExists`;
|
||||
expect(esContext.logger.warn).toHaveBeenCalledTimes(1);
|
||||
expect(esContext.logger.warn).toHaveBeenCalledWith(`${prefix}; 4 more times; error: retry 1`);
|
||||
});
|
||||
|
||||
test('createIndexTemplateIfNotExists with 2 retries', async () => {
|
||||
esContext.esAdapter.doesIndexTemplateExist.mockRejectedValueOnce(new Error('retry 2a'));
|
||||
esContext.esAdapter.doesIndexTemplateExist.mockRejectedValueOnce(new Error('retry 2b'));
|
||||
|
||||
const timeStart = Date.now();
|
||||
await initializeEs(esContext);
|
||||
const timeElapsed = Date.now() - timeStart;
|
||||
|
||||
expect(timeElapsed).toBeGreaterThanOrEqual(MOCK_RETRY_DELAY * (1 + 2));
|
||||
|
||||
expect(esContext.esAdapter.getExistingLegacyIndexTemplates).toHaveBeenCalledTimes(1);
|
||||
expect(esContext.esAdapter.doesIlmPolicyExist).toHaveBeenCalledTimes(1);
|
||||
expect(esContext.esAdapter.doesIndexTemplateExist).toHaveBeenCalledTimes(3);
|
||||
expect(esContext.esAdapter.doesAliasExist).toHaveBeenCalledTimes(1);
|
||||
|
||||
const prefix = `eventLog initialization operation failed and will be retried: createIndexTemplateIfNotExists`;
|
||||
expect(esContext.logger.warn).toHaveBeenCalledTimes(2);
|
||||
expect(esContext.logger.warn).toHaveBeenCalledWith(`${prefix}; 4 more times; error: retry 2a`);
|
||||
expect(esContext.logger.warn).toHaveBeenCalledWith(`${prefix}; 3 more times; error: retry 2b`);
|
||||
});
|
||||
|
||||
test('createInitialIndexIfNotExists', async () => {
|
||||
esContext.esAdapter.doesAliasExist.mockRejectedValueOnce(new Error('retry 5a'));
|
||||
esContext.esAdapter.doesAliasExist.mockRejectedValueOnce(new Error('retry 5b'));
|
||||
esContext.esAdapter.doesAliasExist.mockRejectedValueOnce(new Error('retry 5c'));
|
||||
esContext.esAdapter.doesAliasExist.mockRejectedValueOnce(new Error('retry 5d'));
|
||||
esContext.esAdapter.doesAliasExist.mockRejectedValueOnce(new Error('retry 5e'));
|
||||
// make sure it only tries 5 times - this one should not be reported
|
||||
esContext.esAdapter.doesAliasExist.mockRejectedValueOnce(new Error('retry 5f'));
|
||||
|
||||
const timeStart = Date.now();
|
||||
await initializeEs(esContext);
|
||||
const timeElapsed = Date.now() - timeStart;
|
||||
|
||||
expect(timeElapsed).toBeGreaterThanOrEqual(MOCK_RETRY_DELAY * (1 + 2 + 4 + 8));
|
||||
|
||||
expect(esContext.esAdapter.getExistingLegacyIndexTemplates).toHaveBeenCalledTimes(1);
|
||||
expect(esContext.esAdapter.doesIlmPolicyExist).toHaveBeenCalledTimes(1);
|
||||
expect(esContext.esAdapter.doesIndexTemplateExist).toHaveBeenCalledTimes(1);
|
||||
expect(esContext.esAdapter.doesAliasExist).toHaveBeenCalledTimes(5);
|
||||
|
||||
const prefix = `eventLog initialization operation failed and will be retried: createInitialIndexIfNotExists`;
|
||||
expect(esContext.logger.warn).toHaveBeenCalledTimes(5);
|
||||
expect(esContext.logger.warn).toHaveBeenCalledWith(`${prefix}; 4 more times; error: retry 5a`);
|
||||
expect(esContext.logger.warn).toHaveBeenCalledWith(`${prefix}; 3 more times; error: retry 5b`);
|
||||
expect(esContext.logger.warn).toHaveBeenCalledWith(`${prefix}; 2 more times; error: retry 5c`);
|
||||
expect(esContext.logger.warn).toHaveBeenCalledWith(`${prefix}; 1 more times; error: retry 5d`);
|
||||
expect(esContext.logger.warn).toHaveBeenCalledWith(`${prefix}; 0 more times; error: retry 5e`);
|
||||
|
||||
expect(esContext.logger.error).toHaveBeenCalledTimes(1);
|
||||
expect(esContext.logger.error).toHaveBeenCalledWith(
|
||||
`error initializing elasticsearch resources: retry 5e`
|
||||
);
|
||||
});
|
||||
});
|
||||
|
|
|
@ -8,9 +8,12 @@
|
|||
import type * as estypes from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
|
||||
import { asyncForEach } from '@kbn/std';
|
||||
import { groupBy } from 'lodash';
|
||||
import pRetry, { FailedAttemptError } from 'p-retry';
|
||||
import { getIlmPolicy, getIndexTemplate } from './documents';
|
||||
import { EsContext } from './context';
|
||||
|
||||
const MAX_RETRY_DELAY = 30000;
|
||||
|
||||
export async function initializeEs(esContext: EsContext): Promise<boolean> {
|
||||
esContext.logger.debug('initializing elasticsearch resources starting');
|
||||
|
||||
|
@ -28,10 +31,33 @@ export async function initializeEs(esContext: EsContext): Promise<boolean> {
|
|||
async function initializeEsResources(esContext: EsContext) {
|
||||
const steps = new EsInitializationSteps(esContext);
|
||||
|
||||
await steps.setExistingAssetsToHidden();
|
||||
await steps.createIlmPolicyIfNotExists();
|
||||
await steps.createIndexTemplateIfNotExists();
|
||||
await steps.createInitialIndexIfNotExists();
|
||||
// today, setExistingAssetsToHidden() never throws, but just in case ...
|
||||
await retry(steps.setExistingAssetsToHidden);
|
||||
await retry(steps.createIlmPolicyIfNotExists);
|
||||
await retry(steps.createIndexTemplateIfNotExists);
|
||||
await retry(steps.createInitialIndexIfNotExists);
|
||||
|
||||
async function retry(stepMethod: () => Promise<void>): Promise<void> {
|
||||
// call the step method with retry options via p-retry
|
||||
await pRetry(() => stepMethod.call(steps), getRetryOptions(esContext, stepMethod.name));
|
||||
}
|
||||
}
|
||||
|
||||
function getRetryOptions(esContext: EsContext, operation: string) {
|
||||
const logger = esContext.logger;
|
||||
// should retry on the order of 2s, 4s, 8s, 16s
|
||||
// see: https://github.com/tim-kos/node-retry#retryoperationoptions
|
||||
return {
|
||||
minTimeout: esContext.retryDelay,
|
||||
maxTimeout: MAX_RETRY_DELAY,
|
||||
retries: 4,
|
||||
factor: 2,
|
||||
randomize: true,
|
||||
onFailedAttempt: (err: FailedAttemptError) => {
|
||||
const message = `eventLog initialization operation failed and will be retried: ${operation}; ${err.retriesLeft} more times; error: ${err.message}`;
|
||||
logger.warn(message);
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
export interface ParsedIndexAlias extends estypes.IndicesAliasDefinition {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue