mirror of
https://github.com/elastic/kibana.git
synced 2025-04-23 09:19:04 -04:00
[7.12] Updated to allow chunked queries and to increase the timeouts of the REST backend (#94531) (#94607)
* Updated to allow chunked queries and to increase the timeouts of the REST backend (#94531) Increases the pre-packaged socket timeout and chunks the requests. Existing e2e tests should cover the changes. Interesting enough, when the server sends back a 408, Chrome will re-send the same request again which can cause socket/network saturations. By increasing the timeout, Chrome will not resend the same request again on timeout. Right now, there is not a way to increase the timeouts for the alerting framework/saved objects as far as I know for connections. That would be an additional safety measure in additional to doing chunked requests. Chunked requests will ensure that the pre-packaged rule does not exhaust ephemeral ports and limit the concurrent requests. See this issue talked about below: https://github.com/sindresorhus/ky/issues/233 https://groups.google.com/a/chromium.org/g/chromium-dev/c/urswDsm6Pe0 https://medium.com/@lighthopper/connection-retry-schedule-in-chrome-browser-a9c814b7dc20 **Manual testing** You can bump up the rule version numbers manually through a search and replace and then install them. You can add a `console.trace()` to the backend and slow down the requests to ensure they are not happening more than once. ``` Trace: at updatePrepackagedRules (/Users/frankhassanabad/projects/kibana/x-pack/plugins/security_solution/server/lib/detection_engine/rules/update_prepacked_rules.ts:34:11) at createPrepackagedRules (/Users/frankhassanabad/projects/kibana/x-pack/plugins/security_solution/server/lib/detection_engine/routes/rules/add_prepackaged_rules_route.ts:140:9) at runMicrotasks (<anonymous>) at processTicksAndRejections (internal/process/task_queues.js:93:5) at /Users/frankhassanabad/projects/kibana/x-pack/plugins/security_solution/server/lib/detection_engine/routes/rules/add_prepackaged_rules_route.ts:66:27 at Router.handle (/Users/frankhassanabad/projects/kibana/src/core/server/http/router/router.ts:272:30) at handler (/Users/frankhassanabad/projects/kibana/src/core/server/http/router/router.ts:227:11) at exports.Manager.execute (/Users/frankhassanabad/projects/kibana/node_modules/@hapi/hapi/lib/toolkit.js:60:28) at Object.internals.handler (/Users/frankhassanabad/projects/kibana/node_modules/@hapi/hapi/lib/handler.js:46:20) at exports.execute (/Users/frankhassanabad/projects/kibana/node_modules/@hapi/hapi/lib/handler.js:31:20) at Request._lifecycle (/Users/frankhassanabad/projects/kibana/node_modules/@hapi/hapi/lib/request.js:371:32) at Request._execute (/Users/frankhassanabad/projects/kibana/node_modules/@hapi/hapi/lib/request.js:279:9) ``` Delete any items that are not applicable to this PR. - [x] [Unit or functional tests](https://www.elastic.co/guide/en/kibana/master/development-tests.html) were updated or added to match the most common scenarios * Wrong import as alerting is now called alerts. bad merge
This commit is contained in:
parent
d252005038
commit
ddf265f57f
2 changed files with 171 additions and 109 deletions
|
@ -5,6 +5,7 @@
|
|||
* 2.0.
|
||||
*/
|
||||
|
||||
import moment from 'moment';
|
||||
import type {
|
||||
AppClient,
|
||||
SecuritySolutionPluginRouter,
|
||||
|
@ -49,6 +50,13 @@ export const addPrepackedRulesRoute = (
|
|||
validate: false,
|
||||
options: {
|
||||
tags: ['access:securitySolution'],
|
||||
timeout: {
|
||||
// FUNFACT: If we do not add a very long timeout what will happen
|
||||
// is that Chrome which receive a 408 error and then do a retry.
|
||||
// This retry can cause lots of connections to happen. Using a very
|
||||
// long timeout will ensure that Chrome does not do retries and saturate the connections.
|
||||
idleSocket: moment.duration('1', 'hour').asMilliseconds(),
|
||||
},
|
||||
},
|
||||
},
|
||||
async (context, _, response) => {
|
||||
|
|
|
@ -6,123 +6,177 @@
|
|||
*/
|
||||
|
||||
import { SavedObjectsClientContract } from 'kibana/server';
|
||||
import { chunk } from 'lodash/fp';
|
||||
import { AddPrepackagedRulesSchemaDecoded } from '../../../../common/detection_engine/schemas/request/add_prepackaged_rules_schema';
|
||||
import { AlertsClient } from '../../../../../alerts/server';
|
||||
import { AlertsClient, PartialAlert } from '../../../../../alerts/server';
|
||||
import { patchRules } from './patch_rules';
|
||||
import { readRules } from './read_rules';
|
||||
import { PartialFilter } from '../types';
|
||||
import { PartialFilter, RuleTypeParams } from '../types';
|
||||
|
||||
/**
|
||||
* How many rules to update at a time is set to 50 from errors coming from
|
||||
* the slow environments such as cloud when the rule updates are > 100 we were
|
||||
* seeing timeout issues.
|
||||
*
|
||||
* Since there is not timeout options at the alerting API level right now, we are
|
||||
* at the mercy of the Elasticsearch server client/server default timeouts and what
|
||||
* we are doing could be considered a workaround to not being able to increase the timeouts.
|
||||
*
|
||||
* However, other bad effects and saturation of connections beyond 50 makes this a "noisy neighbor"
|
||||
* if we don't limit its number of connections as we increase the number of rules that can be
|
||||
* installed at a time.
|
||||
*
|
||||
* Lastly, we saw weird issues where Chrome on upstream 408 timeouts will re-call the REST route
|
||||
* which in turn could create additional connections we want to avoid.
|
||||
*
|
||||
* See file import_rules_route.ts for another area where 50 was chosen, therefore I chose
|
||||
* 50 here to mimic it as well. If you see this re-opened or what similar to it, consider
|
||||
* reducing the 50 above to a lower number.
|
||||
*
|
||||
* See the original ticket here:
|
||||
* https://github.com/elastic/kibana/issues/94418
|
||||
*/
|
||||
export const UPDATE_CHUNK_SIZE = 50;
|
||||
|
||||
/**
|
||||
* Updates the prepackaged rules given a set of rules and output index.
|
||||
* This implements a chunked approach to not saturate network connections and
|
||||
* avoid being a "noisy neighbor".
|
||||
* @param alertsClient Alerting client
|
||||
* @param savedObjectsClient Saved object client
|
||||
* @param rules The rules to apply the update for
|
||||
* @param outputIndex The output index to apply the update to.
|
||||
*/
|
||||
export const updatePrepackagedRules = async (
|
||||
alertsClient: AlertsClient,
|
||||
savedObjectsClient: SavedObjectsClientContract,
|
||||
rules: AddPrepackagedRulesSchemaDecoded[],
|
||||
outputIndex: string
|
||||
): Promise<void> => {
|
||||
await Promise.all(
|
||||
rules.map(async (rule) => {
|
||||
const {
|
||||
author,
|
||||
building_block_type: buildingBlockType,
|
||||
description,
|
||||
event_category_override: eventCategoryOverride,
|
||||
false_positives: falsePositives,
|
||||
from,
|
||||
query,
|
||||
language,
|
||||
license,
|
||||
saved_id: savedId,
|
||||
meta,
|
||||
filters: filtersObject,
|
||||
rule_id: ruleId,
|
||||
index,
|
||||
interval,
|
||||
max_signals: maxSignals,
|
||||
risk_score: riskScore,
|
||||
risk_score_mapping: riskScoreMapping,
|
||||
rule_name_override: ruleNameOverride,
|
||||
name,
|
||||
severity,
|
||||
severity_mapping: severityMapping,
|
||||
tags,
|
||||
to,
|
||||
type,
|
||||
threat,
|
||||
threshold,
|
||||
threat_filters: threatFilters,
|
||||
threat_index: threatIndex,
|
||||
threat_query: threatQuery,
|
||||
threat_mapping: threatMapping,
|
||||
threat_language: threatLanguage,
|
||||
concurrent_searches: concurrentSearches,
|
||||
items_per_search: itemsPerSearch,
|
||||
timestamp_override: timestampOverride,
|
||||
references,
|
||||
version,
|
||||
note,
|
||||
anomaly_threshold: anomalyThreshold,
|
||||
timeline_id: timelineId,
|
||||
timeline_title: timelineTitle,
|
||||
machine_learning_job_id: machineLearningJobId,
|
||||
exceptions_list: exceptionsList,
|
||||
} = rule;
|
||||
|
||||
const existingRule = await readRules({ alertsClient, ruleId, id: undefined });
|
||||
|
||||
// TODO: Fix these either with an is conversion or by better typing them within io-ts
|
||||
const filters: PartialFilter[] | undefined = filtersObject as PartialFilter[];
|
||||
|
||||
// Note: we do not pass down enabled as we do not want to suddenly disable
|
||||
// or enable rules on the user when they were not expecting it if a rule updates
|
||||
return patchRules({
|
||||
alertsClient,
|
||||
author,
|
||||
buildingBlockType,
|
||||
description,
|
||||
eventCategoryOverride,
|
||||
falsePositives,
|
||||
from,
|
||||
query,
|
||||
language,
|
||||
license,
|
||||
outputIndex,
|
||||
rule: existingRule,
|
||||
savedId,
|
||||
savedObjectsClient,
|
||||
meta,
|
||||
filters,
|
||||
index,
|
||||
interval,
|
||||
maxSignals,
|
||||
riskScore,
|
||||
riskScoreMapping,
|
||||
ruleNameOverride,
|
||||
name,
|
||||
severity,
|
||||
severityMapping,
|
||||
tags,
|
||||
timestampOverride,
|
||||
to,
|
||||
type,
|
||||
threat,
|
||||
threshold,
|
||||
threatFilters,
|
||||
threatIndex,
|
||||
threatQuery,
|
||||
threatMapping,
|
||||
threatLanguage,
|
||||
concurrentSearches,
|
||||
itemsPerSearch,
|
||||
references,
|
||||
version,
|
||||
note,
|
||||
anomalyThreshold,
|
||||
enabled: undefined,
|
||||
timelineId,
|
||||
timelineTitle,
|
||||
machineLearningJobId,
|
||||
exceptionsList,
|
||||
actions: undefined,
|
||||
});
|
||||
})
|
||||
);
|
||||
const ruleChunks = chunk(UPDATE_CHUNK_SIZE, rules);
|
||||
for (const ruleChunk of ruleChunks) {
|
||||
const rulePromises = createPromises(alertsClient, savedObjectsClient, ruleChunk, outputIndex);
|
||||
await Promise.all(rulePromises);
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Creates promises of the rules and returns them.
|
||||
* @param alertsClient Alerting client
|
||||
* @param savedObjectsClient Saved object client
|
||||
* @param rules The rules to apply the update for
|
||||
* @param outputIndex The output index to apply the update to.
|
||||
* @returns Promise of what was updated.
|
||||
*/
|
||||
export const createPromises = (
|
||||
alertsClient: AlertsClient,
|
||||
savedObjectsClient: SavedObjectsClientContract,
|
||||
rules: AddPrepackagedRulesSchemaDecoded[],
|
||||
outputIndex: string
|
||||
): Array<Promise<PartialAlert<RuleTypeParams> | null>> => {
|
||||
return rules.map(async (rule) => {
|
||||
const {
|
||||
author,
|
||||
building_block_type: buildingBlockType,
|
||||
description,
|
||||
event_category_override: eventCategoryOverride,
|
||||
false_positives: falsePositives,
|
||||
from,
|
||||
query,
|
||||
language,
|
||||
license,
|
||||
saved_id: savedId,
|
||||
meta,
|
||||
filters: filtersObject,
|
||||
rule_id: ruleId,
|
||||
index,
|
||||
interval,
|
||||
max_signals: maxSignals,
|
||||
risk_score: riskScore,
|
||||
risk_score_mapping: riskScoreMapping,
|
||||
rule_name_override: ruleNameOverride,
|
||||
name,
|
||||
severity,
|
||||
severity_mapping: severityMapping,
|
||||
tags,
|
||||
to,
|
||||
type,
|
||||
threat,
|
||||
threshold,
|
||||
threat_filters: threatFilters,
|
||||
threat_index: threatIndex,
|
||||
threat_query: threatQuery,
|
||||
threat_mapping: threatMapping,
|
||||
threat_language: threatLanguage,
|
||||
concurrent_searches: concurrentSearches,
|
||||
items_per_search: itemsPerSearch,
|
||||
timestamp_override: timestampOverride,
|
||||
references,
|
||||
version,
|
||||
note,
|
||||
anomaly_threshold: anomalyThreshold,
|
||||
timeline_id: timelineId,
|
||||
timeline_title: timelineTitle,
|
||||
machine_learning_job_id: machineLearningJobId,
|
||||
exceptions_list: exceptionsList,
|
||||
} = rule;
|
||||
|
||||
const existingRule = await readRules({ alertsClient, ruleId, id: undefined });
|
||||
|
||||
// TODO: Fix these either with an is conversion or by better typing them within io-ts
|
||||
const filters: PartialFilter[] | undefined = filtersObject as PartialFilter[];
|
||||
|
||||
// Note: we do not pass down enabled as we do not want to suddenly disable
|
||||
// or enable rules on the user when they were not expecting it if a rule updates
|
||||
return patchRules({
|
||||
alertsClient,
|
||||
author,
|
||||
buildingBlockType,
|
||||
description,
|
||||
eventCategoryOverride,
|
||||
falsePositives,
|
||||
from,
|
||||
query,
|
||||
language,
|
||||
license,
|
||||
outputIndex,
|
||||
rule: existingRule,
|
||||
savedId,
|
||||
savedObjectsClient,
|
||||
meta,
|
||||
filters,
|
||||
index,
|
||||
interval,
|
||||
maxSignals,
|
||||
riskScore,
|
||||
riskScoreMapping,
|
||||
ruleNameOverride,
|
||||
name,
|
||||
severity,
|
||||
severityMapping,
|
||||
tags,
|
||||
timestampOverride,
|
||||
to,
|
||||
type,
|
||||
threat,
|
||||
threshold,
|
||||
threatFilters,
|
||||
threatIndex,
|
||||
threatQuery,
|
||||
threatMapping,
|
||||
threatLanguage,
|
||||
concurrentSearches,
|
||||
itemsPerSearch,
|
||||
references,
|
||||
version,
|
||||
note,
|
||||
anomalyThreshold,
|
||||
enabled: undefined,
|
||||
timelineId,
|
||||
timelineTitle,
|
||||
machineLearningJobId,
|
||||
exceptionsList,
|
||||
actions: undefined,
|
||||
});
|
||||
});
|
||||
};
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue