[8.7] [Fleet] Fixing update tags status reporting complete too early (#151330) (#151675)

# Backport

This will backport the following commits from `main` to `8.7`:
- [[Fleet] Fixing update tags status reporting complete too early
(#151330)](https://github.com/elastic/kibana/pull/151330)

<!--- Backport version: 8.9.7 -->

### Questions ?
Please refer to the [Backport tool
documentation](https://github.com/sqren/backport)

<!--BACKPORT [{"author":{"name":"Julia
Bardi","email":"90178898+juliaElastic@users.noreply.github.com"},"sourceCommit":{"committedDate":"2023-02-21T09:30:42Z","message":"[Fleet]
Fixing update tags status reporting complete too early (#151330)\n\n##
Summary\r\n\r\nFixes
https://github.com/elastic/kibana/issues/150654\r\n\r\nFound a bug in
update tags, that when writing out action results, the\r\ncount was not
correct in case of conflicts (less agents were updated\r\nthan
agentIds).\r\nThis resulted in action result count reaching the total
early, and\r\nmarking the action status complete.\r\n\r\nThis caused a
failure in performance tests, because the action was set\r\nto complete,
but actually not all agents had the new tag yet.\r\n\r\nWith the fix on
action results I see the ack count correct, and the\r\naction moving to
COMPLETE when all retries have finished.\r\n\r\nAdded another fix for
`nbAgentsActionCreated` incorrectly showing higher\r\ncount than
expected. This value comes from the sum of agentIds in
the\r\n`.fleet-actions` doc. Changed the implementation so
that\r\n`.fleet-actions` is updated on every retry to only record
agents\r\nupdated, failed, and on last retry conflicted. This ensures
that the\r\n`nbAgentsActionCreated` will be accurate.\r\n\r\nTested with
50k agents, see recording
here\r\nhttps://github.com/elastic/kibana/pull/151330#discussion_r1109526053\r\n\r\n###
Checklist\r\n\r\n- [x] [Unit or
functional\r\ntests](https://www.elastic.co/guide/en/kibana/master/development-tests.html)\r\nwere
updated or added to match the most common
scenarios","sha":"b163cb2d8512ca331c4d99288964d37d2a16fbfb","branchLabelMapping":{"^v8.8.0$":"main","^v(\\d+).(\\d+).\\d+$":"$1.$2"}},"sourcePullRequest":{"labels":["release_note:skip","Team:Fleet","backport:prev-minor","ci:cloud-deploy","v8.8.0"],"number":151330,"url":"https://github.com/elastic/kibana/pull/151330","mergeCommit":{"message":"[Fleet]
Fixing update tags status reporting complete too early (#151330)\n\n##
Summary\r\n\r\nFixes
https://github.com/elastic/kibana/issues/150654\r\n\r\nFound a bug in
update tags, that when writing out action results, the\r\ncount was not
correct in case of conflicts (less agents were updated\r\nthan
agentIds).\r\nThis resulted in action result count reaching the total
early, and\r\nmarking the action status complete.\r\n\r\nThis caused a
failure in performance tests, because the action was set\r\nto complete,
but actually not all agents had the new tag yet.\r\n\r\nWith the fix on
action results I see the ack count correct, and the\r\naction moving to
COMPLETE when all retries have finished.\r\n\r\nAdded another fix for
`nbAgentsActionCreated` incorrectly showing higher\r\ncount than
expected. This value comes from the sum of agentIds in
the\r\n`.fleet-actions` doc. Changed the implementation so
that\r\n`.fleet-actions` is updated on every retry to only record
agents\r\nupdated, failed, and on last retry conflicted. This ensures
that the\r\n`nbAgentsActionCreated` will be accurate.\r\n\r\nTested with
50k agents, see recording
here\r\nhttps://github.com/elastic/kibana/pull/151330#discussion_r1109526053\r\n\r\n###
Checklist\r\n\r\n- [x] [Unit or
functional\r\ntests](https://www.elastic.co/guide/en/kibana/master/development-tests.html)\r\nwere
updated or added to match the most common
scenarios","sha":"b163cb2d8512ca331c4d99288964d37d2a16fbfb"}},"sourceBranch":"main","suggestedTargetBranches":[],"targetPullRequestStates":[{"branch":"main","label":"v8.8.0","labelRegex":"^v8.8.0$","isSourceBranch":true,"state":"MERGED","url":"https://github.com/elastic/kibana/pull/151330","number":151330,"mergeCommit":{"message":"[Fleet]
Fixing update tags status reporting complete too early (#151330)\n\n##
Summary\r\n\r\nFixes
https://github.com/elastic/kibana/issues/150654\r\n\r\nFound a bug in
update tags, that when writing out action results, the\r\ncount was not
correct in case of conflicts (less agents were updated\r\nthan
agentIds).\r\nThis resulted in action result count reaching the total
early, and\r\nmarking the action status complete.\r\n\r\nThis caused a
failure in performance tests, because the action was set\r\nto complete,
but actually not all agents had the new tag yet.\r\n\r\nWith the fix on
action results I see the ack count correct, and the\r\naction moving to
COMPLETE when all retries have finished.\r\n\r\nAdded another fix for
`nbAgentsActionCreated` incorrectly showing higher\r\ncount than
expected. This value comes from the sum of agentIds in
the\r\n`.fleet-actions` doc. Changed the implementation so
that\r\n`.fleet-actions` is updated on every retry to only record
agents\r\nupdated, failed, and on last retry conflicted. This ensures
that the\r\n`nbAgentsActionCreated` will be accurate.\r\n\r\nTested with
50k agents, see recording
here\r\nhttps://github.com/elastic/kibana/pull/151330#discussion_r1109526053\r\n\r\n###
Checklist\r\n\r\n- [x] [Unit or
functional\r\ntests](https://www.elastic.co/guide/en/kibana/master/development-tests.html)\r\nwere
updated or added to match the most common
scenarios","sha":"b163cb2d8512ca331c4d99288964d37d2a16fbfb"}}]}]
BACKPORT-->

Co-authored-by: Julia Bardi <90178898+juliaElastic@users.noreply.github.com>
This commit is contained in:
Kibana Machine 2023-02-21 05:39:06 -05:00 committed by GitHub
parent 0899980f7e
commit 4fcef5cd8f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 94 additions and 24 deletions

View file

@ -110,7 +110,7 @@ describe('update_agent_tags', () => {
expect(agentAction?.body).toEqual(
expect.objectContaining({
action_id: expect.anything(),
agents: ['agent1'],
agents: [expect.any(String)],
type: 'UPDATE_TAGS',
total: 1,
})
@ -120,7 +120,7 @@ describe('update_agent_tags', () => {
const agentIds = actionResults?.body
?.filter((i: any) => i.agent_id)
.map((i: any) => i.agent_id);
expect(agentIds).toEqual(['agent1']);
expect(agentIds.length).toEqual(1);
expect(actionResults.body[1].error).not.toBeDefined();
});
@ -142,7 +142,7 @@ describe('update_agent_tags', () => {
expect(agentAction?.body).toEqual(
expect.objectContaining({
action_id: expect.anything(),
agents: [agentInRegularDoc._id],
agents: [expect.any(String)],
type: 'UPDATE_TAGS',
total: 1,
})
@ -152,12 +152,23 @@ describe('update_agent_tags', () => {
it('should write error action results when failures are returned', async () => {
esClient.updateByQuery.mockReset();
esClient.updateByQuery.mockResolvedValue({
failures: [{ cause: { reason: 'error reason' } }],
failures: [{ id: 'failure1', cause: { reason: 'error reason' } }],
updated: 0,
total: 1,
} as any);
await updateAgentTags(soClient, esClient, { agentIds: ['agent1'] }, ['one'], []);
const agentAction = esClient.create.mock.calls[0][0] as any;
expect(agentAction?.body).toEqual(
expect.objectContaining({
action_id: expect.anything(),
agents: ['failure1'],
type: 'UPDATE_TAGS',
total: 1,
})
);
const errorResults = esClient.bulk.mock.calls[0][0] as any;
expect(errorResults.body[1].error).toEqual('error reason');
});
@ -181,6 +192,7 @@ describe('update_agent_tags', () => {
failures: [],
updated: 0,
version_conflicts: 100,
total: 100,
} as any);
await expect(
@ -198,10 +210,43 @@ describe('update_agent_tags', () => {
}
)
).rejects.toThrowError('version conflict of 100 agents');
const agentAction = esClient.create.mock.calls[0][0] as any;
expect(agentAction?.body.agents.length).toEqual(100);
const errorResults = esClient.bulk.mock.calls[0][0] as any;
expect(errorResults.body[1].error).toEqual('version conflict on last retry');
});
it('should combine action agents from updated, failures and version conflicts on last retry', async () => {
esClient.updateByQuery.mockReset();
esClient.updateByQuery.mockResolvedValue({
failures: [{ id: 'failure1', cause: { reason: 'error reason' } }],
updated: 1,
version_conflicts: 1,
total: 3,
} as any);
await expect(
updateTagsBatch(
soClient,
esClient,
[{ id: 'agent1' } as Agent],
{},
{
tagsToAdd: ['new'],
tagsToRemove: [],
kuery: '',
total: 3,
retryCount: MAX_RETRY_COUNT,
}
)
).rejects.toThrowError('version conflict of 1 agents');
const agentAction = esClient.create.mock.calls[0][0] as any;
expect(agentAction?.body.agents.length).toEqual(3);
});
it('should run add tags async when actioning more agents than batch size', async () => {
esClient.search.mockResolvedValue({
hits: {
@ -301,7 +346,7 @@ describe('update_agent_tags', () => {
it('should write total from total param if updateByQuery returns less results', async () => {
esClient.updateByQuery.mockReset();
esClient.updateByQuery.mockResolvedValue({ failures: [], updated: 0, total: 50 } as any);
esClient.updateByQuery.mockResolvedValue({ failures: [], updated: 1, total: 50 } as any);
await updateTagsBatch(
soClient,
@ -320,7 +365,7 @@ describe('update_agent_tags', () => {
expect(agentAction?.body).toEqual(
expect.objectContaining({
action_id: expect.anything(),
agents: ['agent1'],
agents: [expect.any(String)],
type: 'UPDATE_TAGS',
total: 100,
})

View file

@ -129,56 +129,81 @@ export async function updateTagsBatch(
appContextService.getLogger().debug(JSON.stringify(res).slice(0, 1000));
if (options.retryCount === undefined) {
// creating an action doc so that update tags shows up in activity
await createAgentAction(esClient, {
id: actionId,
agents: agentIds,
created_at: new Date().toISOString(),
type: 'UPDATE_TAGS',
total: options.total ?? res.total,
});
}
// creating unique ids to use as agentId, as we don't have all agent ids in case of action by kuery
const getUuidArray = (count: number) => Array.from({ length: count }, () => uuidv4());
const updatedCount = res.updated ?? 0;
const updatedIds = getUuidArray(updatedCount);
const failures = res.failures ?? [];
const failureCount = failures.length;
const isLastRetry = options.retryCount === MAX_RETRY_COUNT;
const versionConflictCount = res.version_conflicts ?? 0;
const versionConflictIds = isLastRetry ? getUuidArray(versionConflictCount) : [];
// creating an action doc so that update tags shows up in activity
// the logic only saves agent count in the action that updated, failed or in case of last retry, conflicted
// this ensures that the action status count will be accurate
await createAgentAction(esClient, {
id: actionId,
agents: updatedIds
.concat(failures.map((failure) => failure.id))
.concat(isLastRetry ? versionConflictIds : []),
created_at: new Date().toISOString(),
type: 'UPDATE_TAGS',
total: options.total ?? res.total,
});
appContextService
.getLogger()
.debug(
`action doc wrote on ${
updatedCount + failureCount + (isLastRetry ? versionConflictCount : 0)
} agentIds, updated: ${updatedCount}, failed: ${failureCount}, version_conflicts: ${versionConflictCount}`
);
// writing successful action results
if (res.updated ?? 0 > 0) {
if (updatedCount > 0) {
await bulkCreateAgentActionResults(
esClient,
agentIds.map((id) => ({
updatedIds.map((id) => ({
agentId: id,
actionId,
}))
);
appContextService.getLogger().debug(`action updated result wrote on ${updatedCount} agents`);
}
// writing failures from es update
if (res.failures && res.failures.length > 0) {
if (failures.length > 0) {
await bulkCreateAgentActionResults(
esClient,
res.failures.map((failure) => ({
failures.map((failure) => ({
agentId: failure.id,
actionId,
error: failure.cause.reason,
}))
);
appContextService.getLogger().debug(`action failed result wrote on ${failureCount} agents`);
}
if (res.version_conflicts ?? 0 > 0) {
if (versionConflictCount > 0) {
// write out error results on last retry, so action is not stuck in progress
if (options.retryCount === MAX_RETRY_COUNT) {
await bulkCreateAgentActionResults(
esClient,
getUuidArray(res.version_conflicts!).map((id) => ({
versionConflictIds.map((id) => ({
agentId: id,
actionId,
error: 'version conflict on last retry',
}))
);
appContextService
.getLogger()
.debug(`action conflict result wrote on ${versionConflictCount} agents`);
}
throw new Error(`version conflict of ${res.version_conflicts} agents`);
throw new Error(`version conflict of ${versionConflictCount} agents`);
}
return { actionId, updated: res.updated, took: res.took };