[8.16] [ML] File upload fixing PDF character count limit (#197333) (#197601)

# Backport

This will backport the following commits from `main` to `8.16`:
- [[ML] File upload fixing PDF character count limit
(#197333)](https://github.com/elastic/kibana/pull/197333)

<!--- Backport version: 9.4.3 -->

### Questions?
Please refer to the [Backport tool
documentation](https://github.com/sqren/backport)

<!--BACKPORT [{"author":{"name":"James
Gowdy","email":"jgowdy@elastic.co"},"sourceCommit":{"committedDate":"2024-10-24T10:47:58Z","message":"[ML]
File upload fixing PDF character count limit (#197333)\n\nThe default
character limit for the attachment processor is 100000\r\ncharacters.
This limit is useful when previewing the contents of the\r\nfile, but
should not be applied when ingesting the file.\r\n\r\nThis PR changes
the ingest character limit to be unlimited (-1) for\r\ningest and
displays the character limit instead of the line limit for\r\nthe
document
preview.\r\n\r\n\r\n![image](https://github.com/user-attachments/assets/1c0cf324-a2b8-452c-b504-7c5b2935ba1c)","sha":"9aa67ef45596080f742166f1c63e2c8f9a44f100","branchLabelMapping":{"^v9.0.0$":"main","^v8.17.0$":"8.x","^v(\\d+).(\\d+).\\d+$":"$1.$2"}},"sourcePullRequest":{"labels":["release_note:fix",":ml","Feature:File
and Index Data Viz","Feature:File
Upload","v9.0.0","v8.16.0","backport:version","v8.17.0"],"title":"[ML]
File upload fixing PDF character count
limit","number":197333,"url":"https://github.com/elastic/kibana/pull/197333","mergeCommit":{"message":"[ML]
File upload fixing PDF character count limit (#197333)\n\nThe default
character limit for the attachment processor is 100000\r\ncharacters.
This limit is useful when previewing the contents of the\r\nfile, but
should not be applied when ingesting the file.\r\n\r\nThis PR changes
the ingest character limit to be unlimited (-1) for\r\ningest and
displays the character limit instead of the line limit for\r\nthe
document
preview.\r\n\r\n\r\n![image](https://github.com/user-attachments/assets/1c0cf324-a2b8-452c-b504-7c5b2935ba1c)","sha":"9aa67ef45596080f742166f1c63e2c8f9a44f100"}},"sourceBranch":"main","suggestedTargetBranches":["8.16","8.x"],"targetPullRequestStates":[{"branch":"main","label":"v9.0.0","branchLabelMappingKey":"^v9.0.0$","isSourceBranch":true,"state":"MERGED","url":"https://github.com/elastic/kibana/pull/197333","number":197333,"mergeCommit":{"message":"[ML]
File upload fixing PDF character count limit (#197333)\n\nThe default
character limit for the attachment processor is 100000\r\ncharacters.
This limit is useful when previewing the contents of the\r\nfile, but
should not be applied when ingesting the file.\r\n\r\nThis PR changes
the ingest character limit to be unlimited (-1) for\r\ningest and
displays the character limit instead of the line limit for\r\nthe
document
preview.\r\n\r\n\r\n![image](https://github.com/user-attachments/assets/1c0cf324-a2b8-452c-b504-7c5b2935ba1c)","sha":"9aa67ef45596080f742166f1c63e2c8f9a44f100"}},{"branch":"8.16","label":"v8.16.0","branchLabelMappingKey":"^v(\\d+).(\\d+).\\d+$","isSourceBranch":false,"state":"NOT_CREATED"},{"branch":"8.x","label":"v8.17.0","branchLabelMappingKey":"^v8.17.0$","isSourceBranch":false,"state":"NOT_CREATED"}]}]
BACKPORT-->

Co-authored-by: James Gowdy <jgowdy@elastic.co>
This commit is contained in:
Kibana Machine 2024-10-24 23:37:20 +11:00 committed by GitHub
parent b254359d8f
commit 85159e3c59
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 27 additions and 8 deletions

View file

@ -18,9 +18,10 @@ import {
EuiSwitch,
} from '@elastic/eui';
import type { FindFileStructureResponse } from '@kbn/file-upload-plugin/common';
import { TIKA_PREVIEW_CHARS, type FindFileStructureResponse } from '@kbn/file-upload-plugin/common';
import useMountedState from 'react-use/lib/useMountedState';
import { i18n } from '@kbn/i18n';
import { FILE_FORMATS } from '../../../../../common/constants';
import { EDITOR_MODE, JsonEditor } from '../json_editor';
import { useGrokHighlighter } from './use_text_parser';
import { LINE_LIMIT } from './grok_highlighter';
@ -132,13 +133,23 @@ export const FileContents: FC<Props> = ({
<EuiSpacer size="s" />
<FormattedMessage
id="xpack.dataVisualizer.file.fileContents.firstLinesDescription"
defaultMessage="First {numberOfLines, plural, zero {# line} one {# line} other {# lines}}"
values={{
numberOfLines: showHighlights ? LINE_LIMIT : numberOfLines,
}}
/>
{format === FILE_FORMATS.TIKA ? (
<FormattedMessage
id="xpack.dataVisualizer.file.fileContents.characterCount"
defaultMessage="Preview limited to the first {numberOfChars} characters"
values={{
numberOfChars: TIKA_PREVIEW_CHARS,
}}
/>
) : (
<FormattedMessage
id="xpack.dataVisualizer.file.fileContents.firstLinesDescription"
defaultMessage="First {numberOfLines, plural, zero {# line} one {# line} other {# lines}}"
values={{
numberOfLines: showHighlights ? LINE_LIMIT : numberOfLines,
}}
/>
)}
<EuiSpacer size="s" />

View file

@ -98,6 +98,8 @@ export async function analyzeTikaFile(
attachment: {
field: 'data',
remove_binary: true,
// unlimited character count
indexed_chars: -1,
},
},
],

View file

@ -23,3 +23,5 @@ export const FILE_FORMATS = {
SEMI_STRUCTURED_TEXT: 'semi_structured_text',
TIKA: 'tika',
};
/**
 * Character cap applied when previewing the contents of a Tika-parsed file.
 * Passed as `indexed_chars` to the attachment processor for previews and
 * shown to the user in the "Preview limited to the first … characters" message.
 */
export const TIKA_PREVIEW_CHARS = 100_000;

View file

@ -15,3 +15,5 @@ export type {
InputOverrides,
IngestPipeline,
} from './types';
export { TIKA_PREVIEW_CHARS } from './constants';

View file

@ -7,6 +7,7 @@
import type { IScopedClusterClient } from '@kbn/core/server';
import type { PreviewTikaResponse } from '../common/types';
import { TIKA_PREVIEW_CHARS } from '../common/constants';
/**
* Returns the contents of a file using the attachment ingest processor
@ -24,6 +25,7 @@ export async function previewTikaContents(
attachment: {
field: 'data',
remove_binary: true,
indexed_chars: TIKA_PREVIEW_CHARS,
},
},
],