mirror of https://github.com/elastic/kibana.git (synced 2025-04-24 17:59:23 -04:00)
# Backport

This will backport the following commits from `main` to `8.7`:

- [[Fleet] Fixing update tags status reporting complete too early (#151330)](https://github.com/elastic/kibana/pull/151330)

<!--- Backport version: 8.9.7 -->

### Questions?

Please refer to the [Backport tool documentation](https://github.com/sqren/backport)

---

[Fleet] Fixing update tags status reporting complete too early (#151330)

## Summary

Fixes https://github.com/elastic/kibana/issues/150654

Found a bug in update tags: when writing out action results, the count was not correct in case of conflicts (fewer agents were updated than there were agentIds). This made the action result count reach the total early, marking the action status complete.

This caused a failure in performance tests, because the action was set to complete before all agents actually had the new tag.

With the fix on action results, the ack count is correct and the action moves to COMPLETE only when all retries have finished.

Added another fix for `nbAgentsActionCreated` incorrectly showing a higher count than expected. This value comes from the sum of agentIds in the `.fleet-actions` doc. Changed the implementation so that `.fleet-actions` is updated on every retry to record only the agents that were updated or failed, plus, on the last retry, those that conflicted. This ensures that `nbAgentsActionCreated` will be accurate.

Tested with 50k agents; see the recording at https://github.com/elastic/kibana/pull/151330#discussion_r1109526053

### Checklist

- [x] [Unit or functional tests](https://www.elastic.co/guide/en/kibana/master/development-tests.html) were updated or added to match the most common scenarios

Co-authored-by: Julia Bardi <90178898+juliaElastic@users.noreply.github.com>
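A minimal sketch of the accounting problem, assuming a completion check that compares acked results against the action total (the names `ackCountBeforeFix`, `ackCountAfterFix`, and `UpdateByQueryOutcome` are illustrative, not Fleet's actual API):

```ts
// Sketch: an action is considered complete once acked results reach `total`.
interface UpdateByQueryOutcome {
  updated: number;
  failed: number;
  versionConflicts: number;
}

// Before the fix: one result was written per requested agent id, so conflicted
// agents were acked even though their tag update never happened.
const ackCountBeforeFix = (requestedAgentIds: string[]): number => requestedAgentIds.length;

// After the fix: only agents that actually updated or failed are acked;
// conflicts count only on the last retry.
const ackCountAfterFix = (outcome: UpdateByQueryOutcome, isLastRetry: boolean): number =>
  outcome.updated + outcome.failed + (isLastRetry ? outcome.versionConflicts : 0);

const total = 100;
const agentIds = Array.from({ length: total }, (_, i) => `agent${i}`);
const outcome: UpdateByQueryOutcome = { updated: 60, failed: 0, versionConflicts: 40 };

console.log(ackCountBeforeFix(agentIds) >= total); // true — marked complete too early
console.log(ackCountAfterFix(outcome, false) >= total); // false — stays in progress until retries finish
```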
parent 0899980f7e
commit 4fcef5cd8f

2 changed files with 94 additions and 24 deletions
@@ -110,7 +110,7 @@ describe('update_agent_tags', () => {
     expect(agentAction?.body).toEqual(
       expect.objectContaining({
         action_id: expect.anything(),
-        agents: ['agent1'],
+        agents: [expect.any(String)],
         type: 'UPDATE_TAGS',
         total: 1,
       })
@@ -120,7 +120,7 @@ describe('update_agent_tags', () => {
     const agentIds = actionResults?.body
       ?.filter((i: any) => i.agent_id)
       .map((i: any) => i.agent_id);
-    expect(agentIds).toEqual(['agent1']);
+    expect(agentIds.length).toEqual(1);
     expect(actionResults.body[1].error).not.toBeDefined();
   });

@@ -142,7 +142,7 @@ describe('update_agent_tags', () => {
     expect(agentAction?.body).toEqual(
       expect.objectContaining({
         action_id: expect.anything(),
-        agents: [agentInRegularDoc._id],
+        agents: [expect.any(String)],
         type: 'UPDATE_TAGS',
         total: 1,
       })
@@ -152,12 +152,23 @@ describe('update_agent_tags', () => {
   it('should write error action results when failures are returned', async () => {
     esClient.updateByQuery.mockReset();
     esClient.updateByQuery.mockResolvedValue({
-      failures: [{ cause: { reason: 'error reason' } }],
+      failures: [{ id: 'failure1', cause: { reason: 'error reason' } }],
       updated: 0,
+      total: 1,
     } as any);

     await updateAgentTags(soClient, esClient, { agentIds: ['agent1'] }, ['one'], []);

+    const agentAction = esClient.create.mock.calls[0][0] as any;
+    expect(agentAction?.body).toEqual(
+      expect.objectContaining({
+        action_id: expect.anything(),
+        agents: ['failure1'],
+        type: 'UPDATE_TAGS',
+        total: 1,
+      })
+    );
+
     const errorResults = esClient.bulk.mock.calls[0][0] as any;
     expect(errorResults.body[1].error).toEqual('error reason');
   });
@@ -181,6 +192,7 @@ describe('update_agent_tags', () => {
       failures: [],
       updated: 0,
       version_conflicts: 100,
+      total: 100,
     } as any);

     await expect(
@@ -198,10 +210,43 @@ describe('update_agent_tags', () => {
         }
       )
     ).rejects.toThrowError('version conflict of 100 agents');

+    const agentAction = esClient.create.mock.calls[0][0] as any;
+    expect(agentAction?.body.agents.length).toEqual(100);
+
     const errorResults = esClient.bulk.mock.calls[0][0] as any;
     expect(errorResults.body[1].error).toEqual('version conflict on last retry');
   });

+  it('should combine action agents from updated, failures and version conflicts on last retry', async () => {
+    esClient.updateByQuery.mockReset();
+    esClient.updateByQuery.mockResolvedValue({
+      failures: [{ id: 'failure1', cause: { reason: 'error reason' } }],
+      updated: 1,
+      version_conflicts: 1,
+      total: 3,
+    } as any);
+
+    await expect(
+      updateTagsBatch(
+        soClient,
+        esClient,
+        [{ id: 'agent1' } as Agent],
+        {},
+        {
+          tagsToAdd: ['new'],
+          tagsToRemove: [],
+          kuery: '',
+          total: 3,
+          retryCount: MAX_RETRY_COUNT,
+        }
+      )
+    ).rejects.toThrowError('version conflict of 1 agents');
+
+    const agentAction = esClient.create.mock.calls[0][0] as any;
+    expect(agentAction?.body.agents.length).toEqual(3);
+  });
+
   it('should run add tags async when actioning more agents than batch size', async () => {
     esClient.search.mockResolvedValue({
       hits: {
@@ -301,7 +346,7 @@ describe('update_agent_tags', () => {

   it('should write total from total param if updateByQuery returns less results', async () => {
     esClient.updateByQuery.mockReset();
-    esClient.updateByQuery.mockResolvedValue({ failures: [], updated: 0, total: 50 } as any);
+    esClient.updateByQuery.mockResolvedValue({ failures: [], updated: 1, total: 50 } as any);

     await updateTagsBatch(
       soClient,
@@ -320,7 +365,7 @@ describe('update_agent_tags', () => {
     expect(agentAction?.body).toEqual(
       expect.objectContaining({
         action_id: expect.anything(),
-        agents: ['agent1'],
+        agents: [expect.any(String)],
         type: 'UPDATE_TAGS',
         total: 100,
       })

@@ -129,56 +129,81 @@ export async function updateTagsBatch(

   appContextService.getLogger().debug(JSON.stringify(res).slice(0, 1000));

-  if (options.retryCount === undefined) {
-    // creating an action doc so that update tags shows up in activity
-    await createAgentAction(esClient, {
-      id: actionId,
-      agents: agentIds,
-      created_at: new Date().toISOString(),
-      type: 'UPDATE_TAGS',
-      total: options.total ?? res.total,
-    });
-  }
+  // creating unique ids to use as agentId, as we don't have all agent ids in case of action by kuery
+  const getUuidArray = (count: number) => Array.from({ length: count }, () => uuidv4());
+
+  const updatedCount = res.updated ?? 0;
+  const updatedIds = getUuidArray(updatedCount);
+
+  const failures = res.failures ?? [];
+  const failureCount = failures.length;
+
+  const isLastRetry = options.retryCount === MAX_RETRY_COUNT;
+
+  const versionConflictCount = res.version_conflicts ?? 0;
+  const versionConflictIds = isLastRetry ? getUuidArray(versionConflictCount) : [];
+
+  // creating an action doc so that update tags shows up in activity
+  // the logic only saves agent count in the action that updated, failed or in case of last retry, conflicted
+  // this ensures that the action status count will be accurate
+  await createAgentAction(esClient, {
+    id: actionId,
+    agents: updatedIds
+      .concat(failures.map((failure) => failure.id))
+      .concat(isLastRetry ? versionConflictIds : []),
+    created_at: new Date().toISOString(),
+    type: 'UPDATE_TAGS',
+    total: options.total ?? res.total,
+  });
+  appContextService
+    .getLogger()
+    .debug(
+      `action doc wrote on ${
+        updatedCount + failureCount + (isLastRetry ? versionConflictCount : 0)
+      } agentIds, updated: ${updatedCount}, failed: ${failureCount}, version_conflicts: ${versionConflictCount}`
+    );

   // writing successful action results
-  if (res.updated ?? 0 > 0) {
+  if (updatedCount > 0) {
     await bulkCreateAgentActionResults(
       esClient,
-      agentIds.map((id) => ({
+      updatedIds.map((id) => ({
         agentId: id,
         actionId,
       }))
     );
+    appContextService.getLogger().debug(`action updated result wrote on ${updatedCount} agents`);
   }

   // writing failures from es update
-  if (res.failures && res.failures.length > 0) {
+  if (failures.length > 0) {
     await bulkCreateAgentActionResults(
       esClient,
-      res.failures.map((failure) => ({
+      failures.map((failure) => ({
         agentId: failure.id,
         actionId,
         error: failure.cause.reason,
       }))
     );
+    appContextService.getLogger().debug(`action failed result wrote on ${failureCount} agents`);
   }

-  if (res.version_conflicts ?? 0 > 0) {
+  if (versionConflictCount > 0) {
     // write out error results on last retry, so action is not stuck in progress
     if (options.retryCount === MAX_RETRY_COUNT) {
       await bulkCreateAgentActionResults(
         esClient,
-        getUuidArray(res.version_conflicts!).map((id) => ({
+        versionConflictIds.map((id) => ({
           agentId: id,
           actionId,
           error: 'version conflict on last retry',
         }))
       );
+      appContextService
+        .getLogger()
+        .debug(`action conflict result wrote on ${versionConflictCount} agents`);
     }
-    throw new Error(`version conflict of ${res.version_conflicts} agents`);
+    throw new Error(`version conflict of ${versionConflictCount} agents`);
   }

   return { actionId, updated: res.updated, took: res.took };
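A note on the `getUuidArray` helper above: for kuery-based actions the individual agent ids are not all known up front, so the action doc stores one synthetic uuid per settled agent, and `nbAgentsActionCreated` is derived from the summed length of those `agents` arrays. A minimal sketch of that idea (the per-retry arrays below are illustrative, not the actual `.fleet-actions` schema):

```ts
import { randomUUID } from 'node:crypto';

// One synthetic id per settled agent; only the count matters for status reporting.
const getUuidArray = (count: number): string[] =>
  Array.from({ length: count }, () => randomUUID());

// Each retry records only the agents it actually settled (updated, failed, or,
// on the last retry, conflicted), so the summed count cannot overshoot the
// real number of affected agents.
const retryAgentArrays = [getUuidArray(60), getUuidArray(30), getUuidArray(10)];
const nbAgentsActionCreated = retryAgentArrays.reduce((sum, agents) => sum + agents.length, 0);

console.log(nbAgentsActionCreated); // 100
```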