Replaced imghdr with filetype for image detection

This commit is contained in:
remanifest 2025-04-10 20:10:50 -05:00 committed by GitHub
parent 2c7294de0d
commit 9825a3a109
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
28 changed files with 2894 additions and 0 deletions

11
custom_libs/imghdr.py Normal file
View file

@ -0,0 +1,11 @@
import filetype
# Maps the MIME type reported by filetype to the short format name that
# what() returns. Any MIME type not listed here makes what() return None.
_IMG_MIME = {
    'image/jpeg': 'jpeg',
    'image/png': 'png',
    'image/gif': 'gif'
}
def what(_, img=None):
    """
    Stand-in for ``imghdr.what`` backed by the ``filetype`` package.

    Args:
        _: file argument (kept for signature compatibility with
            ``imghdr.what``). It is only inspected when ``img`` is None.
        img: optional image data to inspect. When given, it takes
            precedence over the file argument, as in ``imghdr.what``.

    Returns:
        'jpeg', 'png' or 'gif' when the input is detected as one of those
        formats. Otherwise None.
    """
    # imghdr.what() makes the second argument optional and falls back to the
    # file itself; mirror that so callers passing only a file keep working.
    source = img if img is not None else _
    img_type = filetype.guess(source)
    return _IMG_MIME.get(img_type.mime) if img_type else None

8
libs/bin/filetype Executable file
View file

@ -0,0 +1,8 @@
#!/usr/local/opt/python@3.8/bin/python3.8
# -*- coding: utf-8 -*-
import re
import sys
from filetype.__main__ import main
# Console-script launcher: runs filetype's command-line main() and uses its
# return value as the process exit status.
if __name__ == '__main__':
    # Drop a trailing "-script.pyw" or ".exe" from the program name so that
    # usage/help output shows the bare command name.
    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
    sys.exit(main())

View file

@ -0,0 +1 @@
pip

View file

@ -0,0 +1,21 @@
The MIT License (MIT)
Copyright (c) 2016 Tomás Aparicio
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View file

@ -0,0 +1,212 @@
Metadata-Version: 2.1
Name: filetype
Version: 1.2.0
Summary: Infer file type and MIME type of any file/buffer. No external dependencies.
Home-page: https://github.com/h2non/filetype.py
Download-URL: https://github.com/h2non/filetype.py/tarball/master
Author: Tomas Aparicio
Author-email: tomas@aparicio.me
License: MIT
Keywords: file libmagic magic infer numbers magicnumbers discovery mime type kind
Platform: any
Classifier: Development Status :: 5 - Production/Stable
Classifier: Environment :: Console
Classifier: Environment :: Web Environment
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: System Administrators
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.5
Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Topic :: System
Classifier: Topic :: System :: Filesystems
Classifier: Topic :: Utilities
License-File: LICENSE
filetype.py |Build Status| |PyPI| |Pyversions| |API|
====================================================
Small and dependency free `Python`_ package to infer file type and MIME
type checking the `magic numbers`_ signature of a file or buffer.
This is a Python port from `filetype`_ Go package.
Features
--------
- Simple and friendly API
- Supports a `wide range`_ of file types
- Provides file extension and MIME type inference
- File discovery by extension or MIME type
- File discovery by kind (image, video, audio…)
- `Pluggable`_: add new custom type matchers
- `Fast`_, even processing large files
- Only the first 261 bytes, representing the maximum file header, are required, so
you can just `pass a list of bytes`_
- Dependency free (just Python code, no C extensions, no libmagic
bindings)
- Cross-platform file recognition
Installation
------------
::
pip install filetype
API
---
See `annotated API reference`_.
Examples
--------
Simple file type checking
^^^^^^^^^^^^^^^^^^^^^^^^^
.. code-block:: python
import filetype
def main():
kind = filetype.guess('tests/fixtures/sample.jpg')
if kind is None:
print('Cannot guess file type!')
return
print('File extension: %s' % kind.extension)
print('File MIME type: %s' % kind.mime)
if __name__ == '__main__':
main()
Supported types
---------------
Image
^^^^^
- **dwg** - ``image/vnd.dwg``
- **xcf** - ``image/x-xcf``
- **jpg** - ``image/jpeg``
- **jpx** - ``image/jpx``
- **png** - ``image/png``
- **apng** - ``image/apng``
- **gif** - ``image/gif``
- **webp** - ``image/webp``
- **cr2** - ``image/x-canon-cr2``
- **tif** - ``image/tiff``
- **bmp** - ``image/bmp``
- **jxr** - ``image/vnd.ms-photo``
- **psd** - ``image/vnd.adobe.photoshop``
- **ico** - ``image/x-icon``
- **heic** - ``image/heic``
- **avif** - ``image/avif``
Video
^^^^^
- **3gp** - ``video/3gpp``
- **mp4** - ``video/mp4``
- **m4v** - ``video/x-m4v``
- **mkv** - ``video/x-matroska``
- **webm** - ``video/webm``
- **mov** - ``video/quicktime``
- **avi** - ``video/x-msvideo``
- **wmv** - ``video/x-ms-wmv``
- **mpg** - ``video/mpeg``
- **flv** - ``video/x-flv``
Audio
^^^^^
- **aac** - ``audio/aac``
- **midi** - ``audio/midi``
- **mp3** - ``audio/mpeg``
- **m4a** - ``audio/mp4``
- **ogg** - ``audio/ogg``
- **flac** - ``audio/x-flac``
- **wav** - ``audio/x-wav``
- **amr** - ``audio/amr``
- **aiff** - ``audio/x-aiff``
Archive
^^^^^^^
- **br** - ``application/x-brotli``
- **rpm** - ``application/x-rpm``
- **dcm** - ``application/dicom``
- **epub** - ``application/epub+zip``
- **zip** - ``application/zip``
- **tar** - ``application/x-tar``
- **rar** - ``application/x-rar-compressed``
- **gz** - ``application/gzip``
- **bz2** - ``application/x-bzip2``
- **7z** - ``application/x-7z-compressed``
- **xz** - ``application/x-xz``
- **pdf** - ``application/pdf``
- **exe** - ``application/x-msdownload``
- **swf** - ``application/x-shockwave-flash``
- **rtf** - ``application/rtf``
- **eot** - ``application/octet-stream``
- **ps** - ``application/postscript``
- **sqlite** - ``application/x-sqlite3``
- **nes** - ``application/x-nintendo-nes-rom``
- **crx** - ``application/x-google-chrome-extension``
- **cab** - ``application/vnd.ms-cab-compressed``
- **deb** - ``application/x-deb``
- **ar** - ``application/x-unix-archive``
- **Z** - ``application/x-compress``
- **lzo** - ``application/x-lzop``
- **lz** - ``application/x-lzip``
- **lz4** - ``application/x-lz4``
- **zst** - ``application/zstd``
Document
^^^^^^^^
- **doc** - ``application/msword``
- **docx** - ``application/vnd.openxmlformats-officedocument.wordprocessingml.document``
- **odt** - ``application/vnd.oasis.opendocument.text``
- **xls** - ``application/vnd.ms-excel``
- **xlsx** - ``application/vnd.openxmlformats-officedocument.spreadsheetml.sheet``
- **ods** - ``application/vnd.oasis.opendocument.spreadsheet``
- **ppt** - ``application/vnd.ms-powerpoint``
- **pptx** - ``application/vnd.openxmlformats-officedocument.presentationml.presentation``
- **odp** - ``application/vnd.oasis.opendocument.presentation``
Font
^^^^
- **woff** - ``application/font-woff``
- **woff2** - ``application/font-woff``
- **ttf** - ``application/font-sfnt``
- **otf** - ``application/font-sfnt``
Application
^^^^^^^^^^^
- **wasm** - ``application/wasm``
.. _Python: http://python.org
.. _magic numbers: https://en.wikipedia.org/wiki/Magic_number_(programming)#Magic_numbers_in_files
.. _filetype: https://github.com/h2non/filetype
.. _wide range: #supported-types
.. _Pluggable: #add-additional-file-type-matchers
.. _Fast: #benchmarks
.. _pass a list of bytes: #file-header
.. _annotated API reference: https://h2non.github.io/filetype.py/
.. |Build Status| image:: https://travis-ci.org/h2non/filetype.py.svg?branch=master
:target: https://travis-ci.org/h2non/filetype.py
.. |PyPI| image:: https://img.shields.io/pypi/v/filetype.svg?maxAge=2592000&style=flat-square
:target: https://pypi.python.org/pypi/filetype
.. |Pyversions| image:: https://img.shields.io/pypi/pyversions/filetype.svg?style=flat-square
:target: https://pypi.python.org/pypi/filetype
.. |API| image:: https://img.shields.io/badge/api-docs-green.svg
:target: https://h2non.github.io/filetype.py

View file

@ -0,0 +1,26 @@
../../bin/filetype,sha256=qhZzWIjpierkU7BFhcUqdo8jpXcpw4t1BOBkR_UK1fY,239
filetype-1.2.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
filetype-1.2.0.dist-info/LICENSE,sha256=jkTiqjWzcb3MhWvPDSRCpBDdVf3maw38L83wdtl5Rqw,1082
filetype-1.2.0.dist-info/METADATA,sha256=IukENsJLniEMUy5Kauljm-JAi2RR1th0YD3E7hXw8UM,6532
filetype-1.2.0.dist-info/RECORD,,
filetype-1.2.0.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
filetype-1.2.0.dist-info/WHEEL,sha256=P2T-6epvtXQ2cBOE_U1K4_noqlJFN3tj15djMgEu4NM,110
filetype-1.2.0.dist-info/entry_points.txt,sha256=FW9vQKv-y3mEcT51mUaPeIu3vixzcr6WdLfjD4SFUVM,52
filetype-1.2.0.dist-info/top_level.txt,sha256=9E4F1bIRPoq5TGtC-BHwM1_svcsWYRiC0N_qAGrlW0Y,9
filetype-1.2.0.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
filetype/__init__.py,sha256=7c1C2XIbB7md1oI-0nwBzxoD52he_7NYry3YQV5OXa8,223
filetype/__main__.py,sha256=4-2VK-0hB2mLL_HukB8cOa0jsQKLq95gG4UhCPqF0rg,803
filetype/filetype.py,sha256=SBYUBugfBQSO9z7zyWaXOak6UpLUlmZZ--5FpN0fybM,2122
filetype/helpers.py,sha256=O0hofWlmG8J_X81IuQ8KszvjgnUb-O6BzO-wUJRTLV0,2947
filetype/match.py,sha256=XUHst4XDmYlJtfYAMlGuySl2IWia2UoCb8NIDYiCRgI,3288
filetype/types/__init__.py,sha256=baH8xCYyatykxtCUccgGGIwNdit6x5jGsXxWTvpo4t8,2085
filetype/types/application.py,sha256=6Knc4Y38GbfuoSjdPl29vIsusjNIPjLWVk22nxCnS9I,498
filetype/types/archive.py,sha256=kZWEHJmJ1NmQT0Hm-JmEHI1TXg5NrzxJ4YCbfV-6y8c,17006
filetype/types/audio.py,sha256=oOAS-cdA175rELcK_17w-gylJkmSh8FTrAoVAOwsfUA,4960
filetype/types/base.py,sha256=dvvqVjuSqwtbh2qyP7QnmeWUWUsfrHwJ_rOEgJmDQZ8,647
filetype/types/document.py,sha256=mxOhuymNIpsqMWCgy-Fm8vkSgDoSeCXYADxmO1JPx6Q,7513
filetype/types/font.py,sha256=nP5Ey-EcKMU4phGYtIlQ08I5cecWnr5vzDLVbiPOiyY,2924
filetype/types/image.py,sha256=r8pINANPJZbCEmZKn8F74fFffk4INtDin_GtQtQImZs,9130
filetype/types/isobmff.py,sha256=zLXCbTET6wp_9yq8jE3bhBRTaCdSAKma5ElyHVGd2Sk,958
filetype/types/video.py,sha256=DfkFd5ofnEK25r_n71LxjX3nAAgO8xJ7Op_lL9uEbNc,5371
filetype/utils.py,sha256=sjZCMfYawZ6RWN1Dr3jDmsqIjLSEBFubNgi8HROjaPQ,2089

View file

View file

@ -0,0 +1,6 @@
Wheel-Version: 1.0
Generator: bdist_wheel (0.41.3)
Root-Is-Purelib: true
Tag: py2-none-any
Tag: py3-none-any

View file

@ -0,0 +1,2 @@
[console_scripts]
filetype = filetype.__main__:main

View file

@ -0,0 +1 @@
filetype

View file

@ -0,0 +1 @@

10
libs/filetype/__init__.py Normal file
View file

@ -0,0 +1,10 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from .filetype import * # noqa
from .helpers import * # noqa
from .match import * # noqa
# Current package semver version
__version__ = version = '1.2.0'

37
libs/filetype/__main__.py Normal file
View file

@ -0,0 +1,37 @@
import sys
import filetype
def guess(path):
    """
    Print the detected type of the file at ``path``.

    Prints ``<path>: <extension> (<mime>)`` when the type is recognised,
    or a failure message when it is not.
    """
    kind = filetype.guess(path)
    if kind is not None:
        print('{}: {} ({})'.format(path, kind.extension, kind.mime))
        return
    print('{}: File type determination failure.'.format(path))
def main():
    """
    Command-line entry point.

    Parses ``-f/--file`` (one or more paths) and ``-v/--version``, then
    prints the detected type of every given file. When invoked without any
    argument the help text is printed and the process exits with status 1.
    """
    import argparse

    cli = argparse.ArgumentParser(
        prog='filetype',
        description='Determine type of FILEs.',
    )
    cli.add_argument('-f', '--file', nargs='+')
    cli.add_argument(
        '-v', '--version',
        action='version',
        version='%(prog)s ' + filetype.version,
        help='output version information and exit',
    )
    options = cli.parse_args()

    # Nothing was passed on the command line: show usage and fail.
    if len(sys.argv) < 2:
        cli.print_help()
        sys.exit(1)

    for path in options.file:
        guess(path)


if __name__ == '__main__':
    main()

98
libs/filetype/filetype.py Normal file
View file

@ -0,0 +1,98 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from .match import match
from .types import TYPES, Type
# Expose the supported matcher types. This is an alias of TYPES (the same
# list object), so add_type() below mutates that shared list in place.
types = TYPES
def guess(obj):
    """
    Infer the type of the given input.

    The function accepts several kinds of input and performs the
    type inference that suits each of them.

    Args:
        obj: path to file, bytes or bytearray.

    Returns:
        The matched type instance. None when nothing matches or when
        ``obj`` is falsy (e.g. None or empty).

    Raises:
        TypeError: if obj is not a supported type.
    """
    if not obj:
        return None
    return match(obj)
def guess_mime(obj):
    """
    Infer the file type of the given input and return its MIME type.

    Args:
        obj: path to file, bytes or bytearray.

    Returns:
        The matched MIME type as string. Otherwise None.

    Raises:
        TypeError: if obj is not a supported type.
    """
    matched = guess(obj)
    if matched:
        return matched.mime
    # Nothing matched: hand back the falsy result of guess() unchanged.
    return matched
def guess_extension(obj):
    """
    Infer the file type of the given input and return its file extension.

    Args:
        obj: path to file, bytes or bytearray.

    Returns:
        The matched file extension as string. Otherwise None.

    Raises:
        TypeError: if obj is not a supported type.
    """
    matched = guess(obj)
    if matched:
        return matched.extension
    # Nothing matched: hand back the falsy result of guess() unchanged.
    return matched
def get_type(mime=None, ext=None):
    """
    Look up a registered file type by MIME type or file extension.

    Args:
        mime: MIME string. E.g: image/jpeg, video/mpeg
        ext: file extension string. E.g: jpg, png, mp4, mp3

    Returns:
        The first registered type whose extension equals ``ext`` or whose
        MIME type equals ``mime``. Otherwise None.
    """
    candidates = (
        kind for kind in types
        if kind.extension == ext or kind.mime == mime
    )
    return next(candidates, None)
def add_type(instance):
    """
    Register a new type matcher instance.

    The instance is inserted at the front of the supported types list.

    Args:
        instance: Type inherited instance.

    Returns:
        None

    Raises:
        TypeError: if instance does not inherit from Type.
    """
    if isinstance(instance, Type):
        types.insert(0, instance)
        return
    raise TypeError('instance must inherit from filetype.types.Type')

140
libs/filetype/helpers.py Normal file
View file

@ -0,0 +1,140 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from .types import TYPES
from .match import (
image_match, font_match, document_match,
video_match, audio_match, archive_match
)
def is_extension_supported(ext):
    """
    Tell whether any registered matcher handles the given extension.

    Args:
        ext (str): file extension string. E.g: jpg, png, mp4, mp3

    Returns:
        True if the file extension is supported.
        Otherwise False.
    """
    return any(kind.extension == ext for kind in TYPES)
def is_mime_supported(mime):
    """
    Tell whether any registered matcher handles the given MIME type.

    Args:
        mime (str): MIME string. E.g: image/jpeg, video/mpeg

    Returns:
        True if the MIME type is supported.
        Otherwise False.
    """
    return any(kind.mime == mime for kind in TYPES)
def is_image(obj):
    """
    Check whether the input is recognised by one of the image matchers.

    Args:
        obj: path to file, bytes or bytearray.

    Returns:
        True if obj is a valid image. Otherwise False.

    Raises:
        TypeError: if obj is not a supported type.
    """
    matched = image_match(obj)
    return matched is not None
def is_archive(obj):
    """
    Check whether the input is recognised by one of the archive matchers.

    Args:
        obj: path to file, bytes or bytearray.

    Returns:
        True if obj is a valid archive. Otherwise False.

    Raises:
        TypeError: if obj is not a supported type.
    """
    matched = archive_match(obj)
    return matched is not None
def is_audio(obj):
    """
    Check whether the input is recognised by one of the audio matchers.

    Args:
        obj: path to file, bytes or bytearray.

    Returns:
        True if obj is a valid audio. Otherwise False.

    Raises:
        TypeError: if obj is not a supported type.
    """
    matched = audio_match(obj)
    return matched is not None
def is_video(obj):
    """
    Check whether the input is recognised by one of the video matchers.

    Args:
        obj: path to file, bytes or bytearray.

    Returns:
        True if obj is a valid video. Otherwise False.

    Raises:
        TypeError: if obj is not a supported type.
    """
    matched = video_match(obj)
    return matched is not None
def is_font(obj):
    """
    Check whether the input is recognised by one of the font matchers.

    Args:
        obj: path to file, bytes or bytearray.

    Returns:
        True if obj is a valid font. Otherwise False.

    Raises:
        TypeError: if obj is not a supported type.
    """
    matched = font_match(obj)
    return matched is not None
def is_document(obj):
    """
    Check whether the input is recognised by one of the document matchers.

    Args:
        obj: path to file, bytes or bytearray.

    Returns:
        True if obj is a valid document. Otherwise False.

    Raises:
        TypeError: if obj is not a supported type.
    """
    matched = document_match(obj)
    return matched is not None

155
libs/filetype/match.py Normal file
View file

@ -0,0 +1,155 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from .types import ARCHIVE as archive_matchers
from .types import AUDIO as audio_matchers
from .types import APPLICATION as application_matchers
from .types import DOCUMENT as document_matchers
from .types import FONT as font_matchers
from .types import IMAGE as image_matchers
from .types import VIDEO as video_matchers
from .types import TYPES
from .utils import get_bytes
def match(obj, matchers=TYPES):
    """
    Run the input through the given matchers and return the first hit.

    Args:
        obj: path to file, bytes or bytearray.
        matchers: iterable of type matchers to try, in order.
            Defaults to every supported type.

    Returns:
        Type instance if type matches. Otherwise None.

    Raises:
        TypeError: if obj is not a supported type.
    """
    header = get_bytes(obj)
    hits = (candidate for candidate in matchers if candidate.match(header))
    return next(hits, None)
def image_match(obj):
    """
    Match the given input against the image type matchers only.

    Args:
        obj: path to file, bytes or bytearray.

    Returns:
        Type instance if matches. Otherwise None.

    Raises:
        TypeError: if obj is not a supported type.
    """
    return match(obj, matchers=image_matchers)
def font_match(obj):
    """
    Match the given input against the font type matchers only.

    Args:
        obj: path to file, bytes or bytearray.

    Returns:
        Type instance if matches. Otherwise None.

    Raises:
        TypeError: if obj is not a supported type.
    """
    return match(obj, matchers=font_matchers)
def video_match(obj):
    """
    Match the given input against the video type matchers only.

    Args:
        obj: path to file, bytes or bytearray.

    Returns:
        Type instance if matches. Otherwise None.

    Raises:
        TypeError: if obj is not a supported type.
    """
    return match(obj, matchers=video_matchers)
def audio_match(obj):
    """
    Match the given input against the audio type matchers only.

    Args:
        obj: path to file, bytes or bytearray.

    Returns:
        Type instance if matches. Otherwise None.

    Raises:
        TypeError: if obj is not a supported type.
    """
    return match(obj, matchers=audio_matchers)
def archive_match(obj):
    """
    Match the given input against the archive type matchers only.

    Args:
        obj: path to file, bytes or bytearray.

    Returns:
        Type instance if matches. Otherwise None.

    Raises:
        TypeError: if obj is not a supported type.
    """
    return match(obj, matchers=archive_matchers)
def application_match(obj):
    """
    Match the given input against the application type matchers only.

    Args:
        obj: path to file, bytes or bytearray.

    Returns:
        Type instance if matches. Otherwise None.

    Raises:
        TypeError: if obj is not a supported type.
    """
    return match(obj, matchers=application_matchers)
def document_match(obj):
    """
    Match the given input against the document type matchers only.

    Args:
        obj: path to file, bytes or bytearray.

    Returns:
        Type instance if matches. Otherwise None.

    Raises:
        TypeError: if obj is not a supported type.
    """
    return match(obj, matchers=document_matchers)

View file

@ -0,0 +1,118 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from . import archive
from . import audio
from . import application
from . import document
from . import font
from . import image
from . import video
from .base import Type # noqa
# NOTE: the order inside each tuple is significant. match() in
# filetype/match.py walks the matchers in sequence and returns the first one
# that matches, so a more specific signature has to come before a more
# general one sharing the same prefix (e.g. Epub before Zip, Deb before Ar).

# Supported image types
IMAGE = (
    image.Dwg(),
    image.Xcf(),
    image.Jpeg(),
    image.Jpx(),
    image.Apng(),
    image.Png(),
    image.Gif(),
    image.Webp(),
    image.Tiff(),
    image.Cr2(),
    image.Bmp(),
    image.Jxr(),
    image.Psd(),
    image.Ico(),
    image.Heic(),
    image.Dcm(),
    image.Avif(),
)

# Supported video types
VIDEO = (
    video.M3gp(),
    video.Mp4(),
    video.M4v(),
    video.Mkv(),
    video.Mov(),
    video.Avi(),
    video.Wmv(),
    video.Mpeg(),
    video.Webm(),
    video.Flv(),
)

# Supported audio types
AUDIO = (
    audio.Aac(),
    audio.Midi(),
    audio.Mp3(),
    audio.M4a(),
    audio.Ogg(),
    audio.Flac(),
    audio.Wav(),
    audio.Amr(),
    audio.Aiff(),
)

# Supported font types
FONT = (font.Woff(), font.Woff2(), font.Ttf(), font.Otf())

# Supported archive container types
ARCHIVE = (
    archive.Br(),
    archive.Rpm(),
    archive.Dcm(),
    archive.Epub(),
    archive.Zip(),
    archive.Tar(),
    archive.Rar(),
    archive.Gz(),
    archive.Bz2(),
    archive.SevenZ(),
    archive.Pdf(),
    archive.Exe(),
    archive.Swf(),
    archive.Rtf(),
    archive.Nes(),
    archive.Crx(),
    archive.Cab(),
    archive.Eot(),
    archive.Ps(),
    archive.Xz(),
    archive.Sqlite(),
    archive.Deb(),
    archive.Ar(),
    archive.Z(),
    archive.Lzop(),
    archive.Lz(),
    archive.Elf(),
    archive.Lz4(),
    archive.Zstd(),
)

# Supported application types
APPLICATION = (
    application.Wasm(),
)

# Supported document types
DOCUMENT = (
    document.Doc(),
    document.Docx(),
    document.Odt(),
    document.Xls(),
    document.Xlsx(),
    document.Ods(),
    document.Ppt(),
    document.Pptx(),
    document.Odp(),
)

# Expose supported type matchers as one mutable list, concatenated in the
# order the groups should be tried.
TYPES = list(IMAGE + AUDIO + VIDEO + FONT + DOCUMENT + ARCHIVE + APPLICATION)

View file

@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from .base import Type
class Wasm(Type):
    """Implements the Wasm application type matcher."""

    MIME = 'application/wasm'
    EXTENSION = 'wasm'

    def __init__(self):
        super(Wasm, self).__init__(mime=Wasm.MIME, extension=Wasm.EXTENSION)

    def match(self, buf):
        """Return True when ``buf`` starts with ``\\0asm`` + 01 00 00 00."""
        signature = bytearray([0x00, 0x61, 0x73, 0x6d,
                               0x01, 0x00, 0x00, 0x00])
        return buf[:8] == signature

View file

@ -0,0 +1,687 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import struct
from .base import Type
class Epub(Type):
    """
    Matcher for EPUB archives (``application/epub+zip``).
    """

    MIME = 'application/epub+zip'
    EXTENSION = 'epub'

    def __init__(self):
        super(Epub, self).__init__(mime=Epub.MIME, extension=Epub.EXTENSION)

    def match(self, buf):
        """
        Return True when ``buf`` has a ``PK\\x03\\x04`` header and the text
        ``mimetypeapplication/epub+zip`` at offsets 30-57.
        """
        zip_header = (0x50, 0x4B, 0x3, 0x4)
        # Byte values of the ASCII marker expected at offset 30.
        marker = tuple(bytearray(b'mimetypeapplication/epub+zip'))
        return (len(buf) > 57 and
                tuple(buf[:4]) == zip_header and
                tuple(buf[30:58]) == marker)
class Zip(Type):
    """
    Matcher for ZIP archives (``application/zip``).
    """

    MIME = 'application/zip'
    EXTENSION = 'zip'

    def __init__(self):
        super(Zip, self).__init__(mime=Zip.MIME, extension=Zip.EXTENSION)

    def match(self, buf):
        """
        Return True when ``buf`` starts with ``PK`` followed by a byte in
        {3, 5, 7} and then a byte in {4, 6, 8}.
        """
        if len(buf) <= 3:
            return False
        return (tuple(buf[:2]) == (0x50, 0x4B) and
                buf[2] in (0x3, 0x5, 0x7) and
                buf[3] in (0x4, 0x6, 0x8))
class Tar(Type):
    """
    Matcher for TAR archives (``application/x-tar``).
    """

    MIME = 'application/x-tar'
    EXTENSION = 'tar'

    def __init__(self):
        super(Tar, self).__init__(mime=Tar.MIME, extension=Tar.EXTENSION)

    def match(self, buf):
        """Return True when ``buf`` holds ``ustar`` at offsets 257-261."""
        ustar = (0x75, 0x73, 0x74, 0x61, 0x72)
        return len(buf) > 261 and tuple(buf[257:262]) == ustar
class Rar(Type):
    """
    Matcher for RAR archives (``application/x-rar-compressed``).
    """

    MIME = 'application/x-rar-compressed'
    EXTENSION = 'rar'

    def __init__(self):
        super(Rar, self).__init__(mime=Rar.MIME, extension=Rar.EXTENSION)

    def match(self, buf):
        """
        Return True when ``buf`` starts with ``Rar!\\x1a\\x07`` followed by
        a 0x00 or 0x01 byte.
        """
        prefix = (0x52, 0x61, 0x72, 0x21, 0x1A, 0x7)
        if len(buf) <= 6:
            return False
        return tuple(buf[:6]) == prefix and buf[6] in (0x0, 0x1)
class Gz(Type):
    """
    Matcher for GZ archives (``application/gzip``).
    """

    MIME = 'application/gzip'
    EXTENSION = 'gz'

    def __init__(self):
        super(Gz, self).__init__(mime=Gz.MIME, extension=Gz.EXTENSION)

    def match(self, buf):
        """Return True when ``buf`` starts with the bytes 1F 8B 08."""
        magic = (0x1F, 0x8B, 0x8)
        return len(buf) > 2 and tuple(buf[:3]) == magic
class Bz2(Type):
    """
    Matcher for BZ2 archives (``application/x-bzip2``).
    """

    MIME = 'application/x-bzip2'
    EXTENSION = 'bz2'

    def __init__(self):
        super(Bz2, self).__init__(mime=Bz2.MIME, extension=Bz2.EXTENSION)

    def match(self, buf):
        """Return True when ``buf`` starts with ``BZh``."""
        magic = (0x42, 0x5A, 0x68)
        return len(buf) > 2 and tuple(buf[:3]) == magic
class SevenZ(Type):
    """
    Matcher for SevenZ (7z) archives (``application/x-7z-compressed``).
    """

    MIME = 'application/x-7z-compressed'
    EXTENSION = '7z'

    def __init__(self):
        super(SevenZ, self).__init__(
            mime=SevenZ.MIME, extension=SevenZ.EXTENSION)

    def match(self, buf):
        """Return True when ``buf`` starts with 37 7A BC AF 27 1C."""
        magic = (0x37, 0x7A, 0xBC, 0xAF, 0x27, 0x1C)
        return len(buf) > 5 and tuple(buf[:6]) == magic
class Pdf(Type):
    """
    Matcher for PDF files (``application/pdf``).
    """

    MIME = 'application/pdf'
    EXTENSION = 'pdf'

    def __init__(self):
        super(Pdf, self).__init__(mime=Pdf.MIME, extension=Pdf.EXTENSION)

    def match(self, buf):
        """
        Return True when ``buf`` starts with ``%PDF``, optionally preceded
        by the three-byte sequence EF BB BF (BOM).
        """
        bom = (0xEF, 0xBB, 0xBF)
        # A leading BOM is skipped before looking for the signature.
        if len(buf) > 3 and tuple(buf[:3]) == bom:
            buf = buf[3:]
        return len(buf) > 3 and tuple(buf[:4]) == (0x25, 0x50, 0x44, 0x46)
class Exe(Type):
    """
    Matcher for EXE files (``application/x-msdownload``).
    """

    MIME = 'application/x-msdownload'
    EXTENSION = 'exe'

    def __init__(self):
        super(Exe, self).__init__(mime=Exe.MIME, extension=Exe.EXTENSION)

    def match(self, buf):
        """Return True when ``buf`` starts with ``MZ``."""
        return len(buf) > 1 and tuple(buf[:2]) == (0x4D, 0x5A)
class Swf(Type):
    """
    Matcher for SWF files (``application/x-shockwave-flash``).
    """

    MIME = 'application/x-shockwave-flash'
    EXTENSION = 'swf'

    def __init__(self):
        super(Swf, self).__init__(mime=Swf.MIME, extension=Swf.EXTENSION)

    def match(self, buf):
        """Return True when ``buf`` starts with ``CWS`` or ``FWS``."""
        if len(buf) <= 2:
            return False
        return buf[0] in (0x43, 0x46) and tuple(buf[1:3]) == (0x57, 0x53)
class Rtf(Type):
    """
    Matcher for RTF files (``application/rtf``).
    """

    MIME = 'application/rtf'
    EXTENSION = 'rtf'

    def __init__(self):
        super(Rtf, self).__init__(mime=Rtf.MIME, extension=Rtf.EXTENSION)

    def match(self, buf):
        """Return True when ``buf`` starts with ``{\\rtf``."""
        magic = (0x7B, 0x5C, 0x72, 0x74, 0x66)
        return len(buf) > 4 and tuple(buf[:5]) == magic
class Nes(Type):
    """
    Matcher for NES ROM files (``application/x-nintendo-nes-rom``).
    """

    MIME = 'application/x-nintendo-nes-rom'
    EXTENSION = 'nes'

    def __init__(self):
        super(Nes, self).__init__(mime=Nes.MIME, extension=Nes.EXTENSION)

    def match(self, buf):
        """Return True when ``buf`` starts with ``NES`` followed by 0x1A."""
        magic = (0x4E, 0x45, 0x53, 0x1A)
        return len(buf) > 3 and tuple(buf[:4]) == magic
class Crx(Type):
    """
    Matcher for CRX files (``application/x-google-chrome-extension``).
    """

    MIME = 'application/x-google-chrome-extension'
    EXTENSION = 'crx'

    def __init__(self):
        super(Crx, self).__init__(mime=Crx.MIME, extension=Crx.EXTENSION)

    def match(self, buf):
        """Return True when ``buf`` starts with ``Cr24``."""
        magic = (0x43, 0x72, 0x32, 0x34)
        return len(buf) > 3 and tuple(buf[:4]) == magic
class Cab(Type):
    """
    Matcher for CAB archives (``application/vnd.ms-cab-compressed``).
    """

    MIME = 'application/vnd.ms-cab-compressed'
    EXTENSION = 'cab'

    def __init__(self):
        super(Cab, self).__init__(mime=Cab.MIME, extension=Cab.EXTENSION)

    def match(self, buf):
        """Return True when ``buf`` starts with ``MSCF`` or ``ISc(``."""
        signatures = (
            (0x4D, 0x53, 0x43, 0x46),
            (0x49, 0x53, 0x63, 0x28),
        )
        return len(buf) > 3 and tuple(buf[:4]) in signatures
class Eot(Type):
    """
    Matcher for EOT files (``application/octet-stream``).
    """

    MIME = 'application/octet-stream'
    EXTENSION = 'eot'

    def __init__(self):
        super(Eot, self).__init__(mime=Eot.MIME, extension=Eot.EXTENSION)

    def match(self, buf):
        """
        Return True when ``buf`` has ``LP`` at offsets 34-35 and one of the
        accepted three-byte sequences at offsets 8-10.
        """
        accepted = (
            (0x02, 0x00, 0x01),
            (0x01, 0x00, 0x00),
            (0x02, 0x00, 0x02),
        )
        if len(buf) <= 35:
            return False
        return (tuple(buf[34:36]) == (0x4C, 0x50) and
                tuple(buf[8:11]) in accepted)
class Ps(Type):
    """
    Matcher for PS files (``application/postscript``).
    """

    MIME = 'application/postscript'
    EXTENSION = 'ps'

    def __init__(self):
        super(Ps, self).__init__(mime=Ps.MIME, extension=Ps.EXTENSION)

    def match(self, buf):
        """Return True when ``buf`` starts with ``%!``."""
        return len(buf) > 1 and tuple(buf[:2]) == (0x25, 0x21)
class Xz(Type):
    """
    Matcher for XZ archives (``application/x-xz``).
    """

    MIME = 'application/x-xz'
    EXTENSION = 'xz'

    def __init__(self):
        super(Xz, self).__init__(mime=Xz.MIME, extension=Xz.EXTENSION)

    def match(self, buf):
        """Return True when ``buf`` starts with FD 37 7A 58 5A 00."""
        magic = (0xFD, 0x37, 0x7A, 0x58, 0x5A, 0x00)
        return len(buf) > 5 and tuple(buf[:6]) == magic
class Sqlite(Type):
    """
    Matcher for Sqlite DB files (``application/x-sqlite3``).
    """

    MIME = 'application/x-sqlite3'
    EXTENSION = 'sqlite'

    def __init__(self):
        super(Sqlite, self).__init__(
            mime=Sqlite.MIME, extension=Sqlite.EXTENSION)

    def match(self, buf):
        """Return True when ``buf`` starts with ``SQLi``."""
        magic = (0x53, 0x51, 0x4C, 0x69)
        return len(buf) > 3 and tuple(buf[:4]) == magic
class Deb(Type):
    """
    Matcher for DEB archives (``application/x-deb``).
    """

    MIME = 'application/x-deb'
    EXTENSION = 'deb'

    def __init__(self):
        super(Deb, self).__init__(mime=Deb.MIME, extension=Deb.EXTENSION)

    def match(self, buf):
        """
        Return True when ``buf`` starts with ``!<arch>\\n`` immediately
        followed by ``debian-binary``.
        """
        # Byte values of the 21-byte ASCII prefix.
        magic = tuple(bytearray(b'!<arch>\ndebian-binary'))
        return len(buf) > 20 and tuple(buf[:21]) == magic
class Ar(Type):
    """
    Matcher for AR archives (``application/x-unix-archive``).
    """

    MIME = 'application/x-unix-archive'
    EXTENSION = 'ar'

    def __init__(self):
        super(Ar, self).__init__(mime=Ar.MIME, extension=Ar.EXTENSION)

    def match(self, buf):
        """Return True when ``buf`` starts with ``!<arch>``."""
        magic = (0x21, 0x3C, 0x61, 0x72, 0x63, 0x68, 0x3E)
        return len(buf) > 6 and tuple(buf[:7]) == magic
class Z(Type):
    """
    Matcher for Z archives (``application/x-compress``).
    """

    MIME = 'application/x-compress'
    EXTENSION = 'Z'

    def __init__(self):
        super(Z, self).__init__(mime=Z.MIME, extension=Z.EXTENSION)

    def match(self, buf):
        """Return True when ``buf`` starts with 1F A0 or 1F 9D."""
        if len(buf) <= 1:
            return False
        return buf[0] == 0x1F and buf[1] in (0xA0, 0x9D)
class Lzop(Type):
    """
    Matcher for Lzop archives (``application/x-lzop``).
    """

    MIME = 'application/x-lzop'
    EXTENSION = 'lzo'

    def __init__(self):
        super(Lzop, self).__init__(mime=Lzop.MIME, extension=Lzop.EXTENSION)

    def match(self, buf):
        """Return True when ``buf`` starts with 89 4C 5A 4F 00 0D 0A 1A."""
        magic = (0x89, 0x4C, 0x5A, 0x4F, 0x00, 0x0D, 0x0A, 0x1A)
        return len(buf) > 7 and tuple(buf[:8]) == magic
class Lz(Type):
    """
    Matcher for Lz archives (``application/x-lzip``).
    """

    MIME = 'application/x-lzip'
    EXTENSION = 'lz'

    def __init__(self):
        super(Lz, self).__init__(mime=Lz.MIME, extension=Lz.EXTENSION)

    def match(self, buf):
        """Return True when ``buf`` starts with ``LZIP``."""
        magic = (0x4C, 0x5A, 0x49, 0x50)
        return len(buf) > 3 and tuple(buf[:4]) == magic
class Elf(Type):
    """
    Matcher for Elf files (``application/x-executable``).
    """

    MIME = 'application/x-executable'
    EXTENSION = 'elf'

    def __init__(self):
        super(Elf, self).__init__(mime=Elf.MIME, extension=Elf.EXTENSION)

    def match(self, buf):
        """
        Return True when ``buf`` is longer than 52 bytes and starts
        with 0x7F followed by ``ELF``.
        """
        magic = (0x7F, 0x45, 0x4C, 0x46)
        return len(buf) > 52 and tuple(buf[:4]) == magic
class Lz4(Type):
    """
    Matcher for Lz4 archives (``application/x-lz4``).
    """

    MIME = 'application/x-lz4'
    EXTENSION = 'lz4'

    def __init__(self):
        super(Lz4, self).__init__(mime=Lz4.MIME, extension=Lz4.EXTENSION)

    def match(self, buf):
        """Return True when ``buf`` starts with the bytes 04 22 4D 18."""
        magic = (0x04, 0x22, 0x4D, 0x18)
        return len(buf) > 3 and tuple(buf[:4]) == magic
class Br(Type):
    """Implements the Br archive type matcher."""

    MIME = 'application/x-brotli'
    EXTENSION = 'br'

    def __init__(self):
        super(Br, self).__init__(mime=Br.MIME, extension=Br.EXTENSION)

    def match(self, buf):
        """Return True when ``buf`` starts with the bytes CE B2 CF 81."""
        signature = bytearray([0xce, 0xb2, 0xcf, 0x81])
        return buf[:4] == signature
class Dcm(Type):
    """Implements the Dcm (DICOM) type matcher."""

    MIME = 'application/dicom'
    EXTENSION = 'dcm'

    def __init__(self):
        super(Dcm, self).__init__(
            mime=Dcm.MIME,
            extension=Dcm.EXTENSION
        )

    def match(self, buf):
        """
        Check for the ``DICM`` marker at offsets 128-131.

        Args:
            buf: leading bytes of the input (bytes or bytearray).

        Returns:
            True if the four bytes at offset 128 are ``DICM``.
            Otherwise False (including when buf is too short).
        """
        # The slice must span four bytes (128..131 inclusive); a three-byte
        # slice can never equal the four-byte signature.
        return buf[128:132] == bytearray([0x44, 0x49, 0x43, 0x4d])
class Rpm(Type):
    """Implements the Rpm archive type matcher."""

    MIME = 'application/x-rpm'
    EXTENSION = 'rpm'

    def __init__(self):
        super(Rpm, self).__init__(mime=Rpm.MIME, extension=Rpm.EXTENSION)

    def match(self, buf):
        """Return True when ``buf`` starts with the bytes ED AB EE DB."""
        signature = bytearray([0xed, 0xab, 0xee, 0xdb])
        return buf[:4] == signature
class Zstd(Type):
    """
    Implements the Zstd archive type matcher.
    https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md
    """

    MIME = 'application/zstd'
    EXTENSION = 'zst'
    # A skippable frame's magic number is MAGIC_SKIPPABLE_START with any
    # value in its low 4 bits, hence the mask.
    MAGIC_SKIPPABLE_START = 0x184D2A50
    MAGIC_SKIPPABLE_MASK = 0xFFFFFFF0

    def __init__(self):
        super(Zstd, self).__init__(
            mime=Zstd.MIME,
            extension=Zstd.EXTENSION
        )

    @staticmethod
    def _to_little_endian_int(buf):
        """Decode exactly four bytes as an unsigned little-endian integer."""
        # Same result as int.from_bytes(buf, byteorder='little').
        return struct.unpack('<L', buf)[0]

    def match(self, buf):
        """
        Check whether ``buf`` holds Zstandard data.

        Zstandard compressed data is made of one or more frames. There are
        two frame formats: Zstandard frames and Skippable frames. Leading
        skippable frames are stepped over until a Zstandard frame magic is
        found. See
        https://tools.ietf.org/id/draft-kucherawy-dispatch-zstd-00.html#rfc.section.2

        Args:
            buf: leading bytes of the input (bytes or bytearray).

        Returns:
            True if a Zstandard frame magic is found at the start of buf or
            right after a run of complete skippable frames. Otherwise False.
        """
        # Walk the frames iteratively: a long chain of empty skippable frames
        # would otherwise recurse once per frame and could exceed the
        # interpreter's recursion limit.
        offset = 0
        total = len(buf)
        while True:
            remaining = total - offset
            is_zstd = (
                remaining > 3 and
                buf[offset] in (0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28) and
                buf[offset + 1] == 0xb5 and
                buf[offset + 2] == 0x2f and
                buf[offset + 3] == 0xfd)
            if is_zstd:
                return True

            # A skippable frame needs a 4-byte magic plus a 4-byte length.
            if remaining < 8:
                return False
            magic = self._to_little_endian_int(buf[offset:offset + 4])
            if magic & Zstd.MAGIC_SKIPPABLE_MASK != Zstd.MAGIC_SKIPPABLE_START:
                return False

            user_data_len = self._to_little_endian_int(
                buf[offset + 4:offset + 8])
            if remaining < 8 + user_data_len:
                return False
            # Jump past this skippable frame and inspect the next one.
            offset += 8 + user_data_len

View file

@ -0,0 +1,212 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from .base import Type
class Midi(Type):
    """
    Matcher for MIDI audio, recognised by the leading ASCII bytes 'MThd'.
    """
    MIME = 'audio/midi'
    EXTENSION = 'midi'

    def __init__(self):
        super(Midi, self).__init__(mime=Midi.MIME, extension=Midi.EXTENSION)

    def match(self, buf):
        """Return True when ``buf`` has at least four bytes spelling 'MThd'."""
        if len(buf) <= 3:
            return False
        return tuple(buf[:4]) == (0x4D, 0x54, 0x68, 0x64)
class Mp3(Type):
    """
    Matcher for MP3 audio.

    Accepts either the ASCII tag 'ID3' at the start of the buffer or a
    first byte of 0xFF followed by 0xF2, 0xF3 or 0xFB.
    """
    MIME = 'audio/mpeg'
    EXTENSION = 'mp3'

    def __init__(self):
        super(Mp3, self).__init__(mime=Mp3.MIME, extension=Mp3.EXTENSION)

    def match(self, buf):
        """Return True for buffers longer than two bytes with an MP3 signature."""
        if len(buf) <= 2:
            return False
        # 'ID3' tag header.
        if buf[0] == 0x49 and buf[1] == 0x44 and buf[2] == 0x33:
            return True
        # 0xFF followed by one of the accepted second bytes.
        return buf[0] == 0xFF and buf[1] in (0xF2, 0xF3, 0xFB)
class M4a(Type):
    """
    Matcher for M4A audio.

    Accepts 'ftypM4A' at byte offset 4, or 'M4A ' at the very start
    of the buffer.
    """
    MIME = 'audio/mp4'
    EXTENSION = 'm4a'

    def __init__(self):
        super(M4a, self).__init__(mime=M4a.MIME, extension=M4a.EXTENSION)

    def match(self, buf):
        """Return True for buffers longer than ten bytes carrying either signature."""
        if len(buf) <= 10:
            return False
        # 'ftypM4A' at offsets 4..10.
        if tuple(buf[4:11]) == (0x66, 0x74, 0x79, 0x70, 0x4D, 0x34, 0x41):
            return True
        # 'M4A ' at offsets 0..3.
        return tuple(buf[:4]) == (0x4D, 0x34, 0x41, 0x20)
class Ogg(Type):
    """
    Matcher for OGG audio, recognised by the leading ASCII bytes 'OggS'.
    """
    MIME = 'audio/ogg'
    EXTENSION = 'ogg'

    def __init__(self):
        super(Ogg, self).__init__(mime=Ogg.MIME, extension=Ogg.EXTENSION)

    def match(self, buf):
        """Return True when ``buf`` has at least four bytes spelling 'OggS'."""
        capture = (0x4F, 0x67, 0x67, 0x53)
        if len(buf) < len(capture):
            return False
        return all(buf[pos] == value for pos, value in enumerate(capture))
class Flac(Type):
    """
    Matcher for FLAC audio, recognised by the leading ASCII bytes 'fLaC'.
    """
    MIME = 'audio/x-flac'
    EXTENSION = 'flac'

    def __init__(self):
        super(Flac, self).__init__(mime=Flac.MIME, extension=Flac.EXTENSION)

    def match(self, buf):
        """Return True when ``buf`` has at least four bytes spelling 'fLaC'."""
        if len(buf) <= 3:
            return False
        return tuple(buf[:4]) == (0x66, 0x4C, 0x61, 0x43)
class Wav(Type):
    """
    Matcher for WAV audio: 'RIFF' at offset 0 and 'WAVE' at offset 8.
    """
    MIME = 'audio/x-wav'
    EXTENSION = 'wav'

    def __init__(self):
        super(Wav, self).__init__(mime=Wav.MIME, extension=Wav.EXTENSION)

    def match(self, buf):
        """Return True for buffers longer than eleven bytes carrying both tags."""
        if len(buf) <= 11:
            return False
        container_tag = tuple(buf[:4])
        form_tag = tuple(buf[8:12])
        return (container_tag == (0x52, 0x49, 0x46, 0x46) and
                form_tag == (0x57, 0x41, 0x56, 0x45))
class Amr(Type):
    """
    Matcher for AMR audio, recognised by the six leading bytes
    '#!AMR' followed by a line feed (0x0A).
    """
    MIME = 'audio/amr'
    EXTENSION = 'amr'

    def __init__(self):
        super(Amr, self).__init__(mime=Amr.MIME, extension=Amr.EXTENSION)

    def match(self, buf):
        """
        Return True for buffers longer than eleven bytes that start
        with the six-byte AMR header.
        """
        # The length requirement (more than 11 bytes) is stricter than
        # the six bytes actually inspected.
        if len(buf) <= 11:
            return False
        return tuple(buf[:6]) == (0x23, 0x21, 0x41, 0x4D, 0x52, 0x0A)
class Aac(Type):
    """
    Matcher for AAC audio, recognised by the leading byte pair
    FF F1 or FF F9.
    """
    MIME = 'audio/aac'
    EXTENSION = 'aac'

    def __init__(self):
        super(Aac, self).__init__(mime=Aac.MIME, extension=Aac.EXTENSION)

    def match(self, buf):
        """Return True when the first two bytes equal either accepted pair."""
        head = buf[:2]
        if head == bytearray([0xff, 0xf1]):
            return True
        return head == bytearray([0xff, 0xf9])
class Aiff(Type):
    """
    Matcher for AIFF audio: 'FORM' at offset 0 and 'AIFF' at offset 8.
    """
    MIME = 'audio/x-aiff'
    EXTENSION = 'aiff'

    def __init__(self):
        super(Aiff, self).__init__(mime=Aiff.MIME, extension=Aiff.EXTENSION)

    def match(self, buf):
        """Return True for buffers longer than eleven bytes carrying both tags."""
        if len(buf) <= 11:
            return False
        if tuple(buf[:4]) != (0x46, 0x4F, 0x52, 0x4D):
            return False
        return tuple(buf[8:12]) == (0x41, 0x49, 0x46, 0x46)

View file

@ -0,0 +1,29 @@
# -*- coding: utf-8 -*-
class Type(object):
    """
    Represents the file type object inherited by
    specific file type matchers.
    Provides convenient accessor and helper methods.
    """

    def __init__(self, mime, extension):
        """
        Store the MIME type and file extension of the matcher.

        Args:
            mime: MIME type string, exposed read-only as ``mime``.
            extension: file extension string, exposed read-only as
                ``extension``.
        """
        self.__mime = mime
        self.__extension = extension

    @property
    def mime(self):
        """MIME type given at construction time."""
        return self.__mime

    @property
    def extension(self):
        """File extension given at construction time."""
        return self.__extension

    def is_extension(self, extension):
        """Return True when ``extension`` equals this type's extension."""
        # Compare by value: identity (``is``) only works by accident for
        # interned strings and fails for equal strings built at runtime.
        return self.__extension == extension

    def is_mime(self, mime):
        """Return True when ``mime`` equals this type's MIME type."""
        # Value comparison, for the same reason as in is_extension().
        return self.__mime == mime

    def match(self, buf):
        """
        Tell whether ``buf`` belongs to this file type.

        Must be overridden by subclasses.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

View file

@ -0,0 +1,256 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from .base import Type
class ZippedDocumentBase(Type):
    """
    Shared base for document matchers stored inside a ZIP container.

    ``match`` checks that the buffer begins with a ZIP local file
    header and then defers to ``match_document``, which subclasses
    must implement.
    """

    def match(self, buf):
        """
        Return the result of ``match_document`` when ``buf`` starts
        with the ZIP local file header signature, otherwise None.
        """
        # The signature has to sit at offset 0 exactly; a later hit or
        # no hit at all (-1) both mean "not our kind of file".
        if self.search_signature(buf, 0, 6000) != 0:
            return
        return self.match_document(buf)

    def match_document(self, buf):
        """Format-specific check; subclasses override this."""
        raise NotImplementedError

    def compare_bytes(self, buf, subslice, start_offset):
        """
        Return True when ``subslice`` occurs in ``buf`` at
        ``start_offset``; False when it differs or would run past the
        end of ``buf``.
        """
        stop = start_offset + len(subslice)
        if stop > len(buf):
            return False
        return buf[start_offset:stop] == subslice

    def search_signature(self, buf, start, rangeNum):
        """
        Find the ZIP local file header signature ('PK' 03 04).

        The search window begins at ``start`` and ends ``rangeNum``
        bytes later, clipped to the length of ``buf``. Returns the
        index of the first hit, or -1 when the window is empty or
        contains no signature.
        """
        signature = b"PK\x03\x04"
        end = min(start + rangeNum, len(buf))
        if start >= end:
            return -1
        return buf.find(signature, start, end)
class OpenDocument(ZippedDocumentBase):
    """
    Base matcher for OpenDocument files.

    The first archive entry must be named 'mimetype' and its content
    must start with this matcher's own MIME type.
    """

    def match_document(self, buf):
        """
        Return None when the entry name at offset 0x1E is not
        'mimetype'; otherwise return whether the bytes at offset 0x26
        equal the ASCII-encoded MIME type of this matcher.
        """
        first_entry_is_mimetype = self.compare_bytes(buf, b"mimetype", 0x1E)
        if not first_entry_is_mimetype:
            return
        expected_mime = bytes(self.mime, "ASCII")
        return self.compare_bytes(buf, expected_mime, 0x26)
class OfficeOpenXml(ZippedDocumentBase):
    """
    Base matcher for Office Open XML documents (docx, pptx, xlsx).

    The document kind is inferred from the directory prefix of an
    archive entry name ('word/', 'ppt/' or 'xl/') and compared with
    the MIME type of the concrete matcher.
    """

    def match_document(self, buf):
        """
        Inspect the first archive entry and, if needed, up to four
        following entries.

        Returns True when an entry name identifies this matcher's
        document kind, and None when nothing conclusive is found.
        """
        # The first entry name sits at offset 0x1E of the buffer.
        verdict = self.match_filename(buf, 0x1E)
        if verdict:
            return verdict

        # Otherwise the first entry has to be one of these
        # well-known names.
        leading_entries = (b"[Content_Types].xml", b"_rels/.rels", b"docProps")
        if not any(self.compare_bytes(buf, entry, 0x1E)
                   for entry in leading_entries):
            return

        # Walk the next local file headers and test each entry name.
        # NOTE: OpenOffice/Libreoffice orders ZIP entry differently, so check the 4th file
        # https://github.com/h2non/filetype/blob/d730d98ad5c990883148485b6fd5adbdd378364a/matchers/document.go#L134
        header_at = 0
        for _ in range(4):
            # Resume the search just past the previous signature.
            header_at = self.search_signature(buf, header_at + 4, 6000)
            if header_at == -1:
                return
            # The entry name starts 30 bytes into the file header.
            verdict = self.match_filename(buf, header_at + 30)
            if verdict:
                return verdict

    def match_filename(self, buf, offset):
        """
        Classify the entry name found at ``offset``.

        Returns whether this matcher's MIME type corresponds to the
        recognised directory prefix, or None when the name starts
        with none of the known prefixes.
        """
        prefix_to_mime = (
            (b"word/",
             "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
            (b"ppt/",
             "application/vnd.openxmlformats-officedocument.presentationml.presentation"),
            (b"xl/",
             "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
        )
        for prefix, mime in prefix_to_mime:
            if self.compare_bytes(buf, prefix, offset):
                return self.mime == mime
class Doc(Type):
    """
    Matcher for Microsoft Word (Office 97-2003) documents.
    """
    MIME = "application/msword"
    EXTENSION = "doc"

    def __init__(self):
        super(Doc, self).__init__(
            mime=Doc.MIME,
            extension=Doc.EXTENSION,
        )

    def match(self, buf):
        """
        Return True when ``buf`` is longer than 515 bytes, starts with
        the eight-byte compound-file signature, and carries one of the
        Word markers checked below.
        """
        has_container = (
            len(buf) > 515
            and buf[0:8] == b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"
        )
        if not has_container:
            return False
        # Marker expected at offset 512.
        if buf[512:516] == b"\xEC\xA5\xC1\x00":
            return True
        # Otherwise look for the 'MSWordDoc' / 'Word.Document.8' marker
        # inside the window 2075..2141 of sufficiently long buffers.
        word_marker = b"\x00\x0A\x00\x00\x00MSWordDoc\x00\x10\x00\x00\x00Word.Document.8\x00\xF49\xB2q"
        return len(buf) > 2142 and word_marker in buf[2075:2142]
class Docx(OfficeOpenXml):
    """
    Matcher for Microsoft Word OOXML (Office 2007+) documents.

    Only declares the MIME type and extension; matching is inherited
    from OfficeOpenXml.
    """
    MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    EXTENSION = "docx"

    def __init__(self):
        super(Docx, self).__init__(
            mime=Docx.MIME,
            extension=Docx.EXTENSION,
        )
class Odt(OpenDocument):
    """
    Matcher for OpenDocument Text documents.

    Only declares the MIME type and extension; matching is inherited
    from OpenDocument.
    """
    MIME = "application/vnd.oasis.opendocument.text"
    EXTENSION = "odt"

    def __init__(self):
        super(Odt, self).__init__(
            mime=Odt.MIME,
            extension=Odt.EXTENSION,
        )
class Xls(Type):
    """
    Matcher for Microsoft Excel (Office 97-2003) documents.
    """
    MIME = "application/vnd.ms-excel"
    EXTENSION = "xls"

    def __init__(self):
        super(Xls, self).__init__(
            mime=Xls.MIME,
            extension=Xls.EXTENSION,
        )

    def match(self, buf):
        """
        Return True when ``buf`` is longer than 520 bytes, starts with
        the eight-byte compound-file signature, and carries one of the
        Excel markers checked below.
        """
        has_container = (
            len(buf) > 520
            and buf[0:8] == b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"
        )
        if not has_container:
            return False
        # FD FF FF FF at offset 512, with byte 518 being 0x00 or 0x02.
        if buf[512:516] == b"\xFD\xFF\xFF\xFF" and buf[518] in (0x00, 0x02):
            return True
        # Eight-byte marker at offset 512.
        if buf[512:520] == b"\x09\x08\x10\x00\x00\x06\x05\x00":
            return True
        # Marker ending in 'Calc' inside the window 1568..2094 of
        # sufficiently long buffers.
        calc_marker = b"\xE2\x00\x00\x00\x5C\x00\x70\x00\x04\x00\x00Calc"
        return len(buf) > 2095 and calc_marker in buf[1568:2095]
class Xlsx(OfficeOpenXml):
    """
    Matcher for Microsoft Excel OOXML (Office 2007+) documents.

    Only declares the MIME type and extension; matching is inherited
    from OfficeOpenXml.
    """
    MIME = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    EXTENSION = "xlsx"

    def __init__(self):
        super(Xlsx, self).__init__(
            mime=Xlsx.MIME,
            extension=Xlsx.EXTENSION,
        )
class Ods(OpenDocument):
    """
    Matcher for OpenDocument Spreadsheet documents.

    Only declares the MIME type and extension; matching is inherited
    from OpenDocument.
    """
    MIME = "application/vnd.oasis.opendocument.spreadsheet"
    EXTENSION = "ods"

    def __init__(self):
        super(Ods, self).__init__(
            mime=Ods.MIME,
            extension=Ods.EXTENSION,
        )
class Ppt(Type):
    """
    Matcher for Microsoft PowerPoint (Office 97-2003) documents.
    """
    MIME = "application/vnd.ms-powerpoint"
    EXTENSION = "ppt"

    def __init__(self):
        super(Ppt, self).__init__(
            mime=Ppt.MIME,
            extension=Ppt.EXTENSION,
        )

    def match(self, buf):
        """
        Return True when ``buf`` is longer than 524 bytes, starts with
        the eight-byte compound-file signature, and carries one of the
        PowerPoint markers checked below.
        """
        has_container = (
            len(buf) > 524
            and buf[0:8] == b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"
        )
        if not has_container:
            return False
        record = buf[512:516]
        # Any of three four-byte markers at offset 512.
        if (record == b"\xA0\x46\x1D\xF0"
                or record == b"\x00\x6E\x1E\xF0"
                or record == b"\x0F\x00\xE8\x03"):
            return True
        # FD FF FF FF at offset 512 with two zero bytes at offset 522.
        if record == b"\xFD\xFF\xFF\xFF" and buf[522:524] == b"\x00\x00":
            return True
        # 'MS PowerPoint 97' marker at offset 2072 of longer buffers.
        return (
            len(buf) > 2096
            and buf[2072:2096]
            == b"\x00\xB9\x29\xE8\x11\x00\x00\x00MS PowerPoint 97"
        )
class Pptx(OfficeOpenXml):
    """
    Matcher for Microsoft PowerPoint OOXML (Office 2007+) documents.

    Only declares the MIME type and extension; matching is inherited
    from OfficeOpenXml.
    """
    MIME = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    EXTENSION = "pptx"

    def __init__(self):
        super(Pptx, self).__init__(
            mime=Pptx.MIME,
            extension=Pptx.EXTENSION,
        )
class Odp(OpenDocument):
    """
    Matcher for OpenDocument Presentation documents.

    Only declares the MIME type and extension; matching is inherited
    from OpenDocument.
    """
    MIME = "application/vnd.oasis.opendocument.presentation"
    EXTENSION = "odp"

    def __init__(self):
        super(Odp, self).__init__(
            mime=Odp.MIME,
            extension=Odp.EXTENSION,
        )

115
libs/filetype/types/font.py Normal file
View file

@ -0,0 +1,115 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from .base import Type
class Woff(Type):
    """
    Matcher for WOFF fonts: 'wOFF' at offset 0 followed by one of
    three accepted four-byte values at offset 4.
    """
    MIME = 'application/font-woff'
    EXTENSION = 'woff'

    def __init__(self):
        super(Woff, self).__init__(mime=Woff.MIME, extension=Woff.EXTENSION)

    def match(self, buf):
        """Return True for buffers longer than seven bytes with a WOFF header."""
        if len(buf) <= 7:
            return False
        if tuple(buf[:4]) != (0x77, 0x4F, 0x46, 0x46):
            return False
        accepted = (
            (0x00, 0x01, 0x00, 0x00),
            (0x4F, 0x54, 0x54, 0x4F),  # 'OTTO'
            (0x74, 0x72, 0x75, 0x65),  # 'true'
        )
        return tuple(buf[4:8]) in accepted
class Woff2(Type):
    """
    Matcher for WOFF2 fonts: 'wOF2' at offset 0 followed by one of
    three accepted four-byte values at offset 4.
    """
    MIME = 'application/font-woff'
    EXTENSION = 'woff2'

    def __init__(self):
        super(Woff2, self).__init__(mime=Woff2.MIME, extension=Woff2.EXTENSION)

    def match(self, buf):
        """Return True for buffers longer than seven bytes with a WOFF2 header."""
        if len(buf) <= 7:
            return False
        if tuple(buf[:4]) != (0x77, 0x4F, 0x46, 0x32):
            return False
        accepted = (
            (0x00, 0x01, 0x00, 0x00),
            (0x4F, 0x54, 0x54, 0x4F),  # 'OTTO'
            (0x74, 0x72, 0x75, 0x65),  # 'true'
        )
        return tuple(buf[4:8]) in accepted
class Ttf(Type):
    """
    Matcher for TTF fonts, recognised by the five leading bytes
    00 01 00 00 00.
    """
    MIME = 'application/font-sfnt'
    EXTENSION = 'ttf'

    def __init__(self):
        super(Ttf, self).__init__(mime=Ttf.MIME, extension=Ttf.EXTENSION)

    def match(self, buf):
        """Return True when ``buf`` has at least five bytes equal to the signature."""
        if len(buf) <= 4:
            return False
        return tuple(buf[:5]) == (0x00, 0x01, 0x00, 0x00, 0x00)
class Otf(Type):
    """
    Matcher for OTF fonts, recognised by the leading bytes 'OTTO'
    followed by a zero byte.
    """
    MIME = 'application/font-sfnt'
    EXTENSION = 'otf'

    def __init__(self):
        super(Otf, self).__init__(mime=Otf.MIME, extension=Otf.EXTENSION)

    def match(self, buf):
        """Return True when ``buf`` has at least five bytes equal to the signature."""
        header = (0x4F, 0x54, 0x54, 0x4F, 0x00)
        if len(buf) < len(header):
            return False
        return all(buf[pos] == value for pos, value in enumerate(header))

View file

@ -0,0 +1,383 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from .base import Type
from .isobmff import IsoBmff
class Jpeg(Type):
    """
    Matcher for JPEG images, recognised by the leading bytes FF D8 FF.
    """
    MIME = 'image/jpeg'
    EXTENSION = 'jpg'

    def __init__(self):
        super(Jpeg, self).__init__(mime=Jpeg.MIME, extension=Jpeg.EXTENSION)

    def match(self, buf):
        """Return True when ``buf`` has at least three bytes equal to FF D8 FF."""
        if len(buf) <= 2:
            return False
        return tuple(buf[:3]) == (0xFF, 0xD8, 0xFF)
class Jpx(Type):
    """
    Matcher for JPEG2000 images: leading bytes 00 00 00 0C and the
    ASCII text 'ftypjp2 ' at offset 16.
    """
    MIME = "image/jpx"
    EXTENSION = "jpx"

    def __init__(self):
        super(Jpx, self).__init__(
            mime=Jpx.MIME,
            extension=Jpx.EXTENSION,
        )

    def match(self, buf):
        """Return True for buffers longer than 50 bytes carrying both markers."""
        if len(buf) <= 50:
            return False
        if tuple(buf[:4]) != (0x00, 0x00, 0x00, 0x0C):
            return False
        return buf[16:24] == b"ftypjp2 "
class Apng(Type):
    """
    Matcher for APNG images: a PNG signature followed by an 'acTL'
    chunk that appears before any 'IDAT' or 'IEND' chunk.
    """
    MIME = 'image/apng'
    EXTENSION = 'apng'

    def __init__(self):
        super(Apng, self).__init__(mime=Apng.MIME, extension=Apng.EXTENSION)

    def match(self, buf):
        """
        Walk the chunks that follow the eight-byte PNG signature and
        return True as soon as an 'acTL' chunk is met, False when
        'IDAT' or 'IEND' comes first or the buffer runs out.
        """
        png_signature = bytearray([0x89, 0x50, 0x4e, 0x47,
                                   0x0d, 0x0a, 0x1a, 0x0a])
        if not (len(buf) > 8 and buf[:8] == png_signature):
            return False

        # Position of the current chunk header, right after the signature.
        cursor = 8
        while cursor < len(buf):
            # Chunk header: 4-byte big-endian data length, 4-byte type.
            data_length = int.from_bytes(buf[cursor:cursor + 4],
                                         byteorder="big")
            chunk_type = buf[cursor + 4:cursor + 8].decode("ascii",
                                                           errors='ignore')
            # In an APNG the acTL chunk precedes IDAT; IEND ends the PNG.
            if chunk_type in ("IDAT", "IEND"):
                return False
            if chunk_type == "acTL":
                return True
            # Skip header (8 bytes), chunk data and the 4-byte CRC.
            cursor += 8 + data_length + 4
        return False
class Png(Type):
    """
    Matcher for PNG images, recognised by the leading bytes
    89 'P' 'N' 'G'.
    """
    MIME = 'image/png'
    EXTENSION = 'png'

    def __init__(self):
        super(Png, self).__init__(mime=Png.MIME, extension=Png.EXTENSION)

    def match(self, buf):
        """Return True when ``buf`` has at least four bytes equal to the signature."""
        if len(buf) <= 3:
            return False
        return tuple(buf[:4]) == (0x89, 0x50, 0x4E, 0x47)
class Gif(Type):
    """
    Matcher for GIF images, recognised by the leading ASCII bytes 'GIF'.
    """
    MIME = 'image/gif'
    EXTENSION = 'gif'

    def __init__(self):
        super(Gif, self).__init__(mime=Gif.MIME, extension=Gif.EXTENSION)

    def match(self, buf):
        """Return True when ``buf`` has at least three bytes spelling 'GIF'."""
        marker = (0x47, 0x49, 0x46)
        if len(buf) < len(marker):
            return False
        return all(buf[pos] == value for pos, value in enumerate(marker))
class Webp(Type):
    """
    Matcher for WEBP images: 'RIFF' at offset 0 and 'WEBPVP' at
    offset 8.
    """
    MIME = 'image/webp'
    EXTENSION = 'webp'

    def __init__(self):
        super(Webp, self).__init__(mime=Webp.MIME, extension=Webp.EXTENSION)

    def match(self, buf):
        """Return True for buffers longer than 13 bytes carrying both tags."""
        if len(buf) <= 13:
            return False
        if tuple(buf[:4]) != (0x52, 0x49, 0x46, 0x46):
            return False
        return tuple(buf[8:14]) == (0x57, 0x45, 0x42, 0x50, 0x56, 0x50)
class Cr2(Type):
    """
    Matcher for CR2 images: either of the two four-byte headers
    'II' 2A 00 or 'MM' 00 2A, plus the bytes 'CR' at offset 8.
    """
    MIME = 'image/x-canon-cr2'
    EXTENSION = 'cr2'

    def __init__(self):
        super(Cr2, self).__init__(mime=Cr2.MIME, extension=Cr2.EXTENSION)

    def match(self, buf):
        """Return True for buffers longer than nine bytes carrying both markers."""
        if len(buf) <= 9:
            return False
        accepted_headers = (
            (0x49, 0x49, 0x2A, 0x0),
            (0x4D, 0x4D, 0x0, 0x2A),
        )
        if tuple(buf[:4]) not in accepted_headers:
            return False
        return buf[8] == 0x43 and buf[9] == 0x52
class Tiff(Type):
    """
    Matcher for TIFF images: either of the two four-byte headers
    'II' 2A 00 or 'MM' 00 2A, provided the bytes at offset 8 are
    not 'CR'.
    """
    MIME = 'image/tiff'
    EXTENSION = 'tif'

    def __init__(self):
        super(Tiff, self).__init__(mime=Tiff.MIME, extension=Tiff.EXTENSION)

    def match(self, buf):
        """Return True for buffers longer than nine bytes with a TIFF header and no 'CR' marker."""
        if len(buf) <= 9:
            return False
        accepted_headers = (
            (0x49, 0x49, 0x2A, 0x0),
            (0x4D, 0x4D, 0x0, 0x2A),
        )
        if tuple(buf[:4]) not in accepted_headers:
            return False
        # Buffers with 'CR' at offsets 8..9 are excluded here.
        return not (buf[8] == 0x43 and buf[9] == 0x52)
class Bmp(Type):
    """
    Matcher for BMP images, recognised by the leading ASCII bytes 'BM'.
    """
    MIME = 'image/bmp'
    EXTENSION = 'bmp'

    def __init__(self):
        super(Bmp, self).__init__(mime=Bmp.MIME, extension=Bmp.EXTENSION)

    def match(self, buf):
        """Return True when ``buf`` has at least two bytes spelling 'BM'."""
        if len(buf) <= 1:
            return False
        return tuple(buf[:2]) == (0x42, 0x4D)
class Jxr(Type):
    """
    Matcher for JXR images, recognised by the leading bytes
    'I' 'I' BC.
    """
    MIME = 'image/vnd.ms-photo'
    EXTENSION = 'jxr'

    def __init__(self):
        super(Jxr, self).__init__(mime=Jxr.MIME, extension=Jxr.EXTENSION)

    def match(self, buf):
        """Return True when ``buf`` has at least three bytes equal to the signature."""
        if len(buf) <= 2:
            return False
        return tuple(buf[:3]) == (0x49, 0x49, 0xBC)
class Psd(Type):
    """
    Matcher for PSD images, recognised by the leading ASCII bytes '8BPS'.
    """
    MIME = 'image/vnd.adobe.photoshop'
    EXTENSION = 'psd'

    def __init__(self):
        super(Psd, self).__init__(mime=Psd.MIME, extension=Psd.EXTENSION)

    def match(self, buf):
        """Return True when ``buf`` has at least four bytes spelling '8BPS'."""
        marker = (0x38, 0x42, 0x50, 0x53)
        if len(buf) < len(marker):
            return False
        return all(buf[pos] == value for pos, value in enumerate(marker))
class Ico(Type):
    """
    Matcher for ICO images, recognised by the leading bytes 00 00 01 00.
    """
    MIME = 'image/x-icon'
    EXTENSION = 'ico'

    def __init__(self):
        super(Ico, self).__init__(mime=Ico.MIME, extension=Ico.EXTENSION)

    def match(self, buf):
        """Return True when ``buf`` has at least four bytes equal to the signature."""
        if len(buf) <= 3:
            return False
        return tuple(buf[:4]) == (0x00, 0x00, 0x01, 0x00)
class Heic(IsoBmff):
    """
    Matcher for HEIC images, decided from the brands read out of the
    buffer by the IsoBmff helpers.
    """
    MIME = 'image/heic'
    EXTENSION = 'heic'

    def __init__(self):
        super(Heic, self).__init__(mime=Heic.MIME, extension=Heic.EXTENSION)

    def match(self, buf):
        """
        Return True when the major brand is 'heic', or when it is
        'mif1' or 'msf1' and 'heic' is listed among the compatible
        brands. Buffers rejected by ``_is_isobmff`` never match.
        """
        if not self._is_isobmff(buf):
            return False
        major_brand, _minor_version, compatible_brands = self._get_ftyp(buf)
        if major_brand == 'heic':
            return True
        return (major_brand in ['mif1', 'msf1']
                and 'heic' in compatible_brands)
class Dcm(Type):
    """
    Matcher for DICOM data: the ASCII bytes 'DICM' located at byte
    offset ``OFFSET`` (128).
    """
    MIME = 'application/dicom'
    EXTENSION = 'dcm'
    OFFSET = 128

    def __init__(self):
        super(Dcm, self).__init__(mime=Dcm.MIME, extension=Dcm.EXTENSION)

    def match(self, buf):
        """
        Return True when ``buf`` is longer than ``OFFSET + 4`` bytes
        and the four bytes starting at ``OFFSET`` spell 'DICM'.
        """
        start = Dcm.OFFSET
        if len(buf) <= start + 4:
            return False
        return tuple(buf[start:start + 4]) == (0x44, 0x49, 0x43, 0x4D)
class Dwg(Type):
    """
    Matcher for DWG images, recognised by the leading ASCII bytes 'AC10'.
    """
    MIME = 'image/vnd.dwg'
    EXTENSION = 'dwg'

    def __init__(self):
        super(Dwg, self).__init__(mime=Dwg.MIME, extension=Dwg.EXTENSION)

    def match(self, buf):
        """Return True when ``buf`` starts with the four signature bytes."""
        magic = bytearray([0x41, 0x43, 0x31, 0x30])
        head = buf[:len(magic)]
        return head == magic
class Xcf(Type):
    """
    Matcher for XCF images, recognised by the ten leading ASCII bytes
    'gimp xcf v'.
    """
    MIME = 'image/x-xcf'
    EXTENSION = 'xcf'

    def __init__(self):
        super(Xcf, self).__init__(mime=Xcf.MIME, extension=Xcf.EXTENSION)

    def match(self, buf):
        """Return True when ``buf`` starts with the ten signature bytes."""
        magic = bytearray([
            0x67, 0x69, 0x6d, 0x70, 0x20,
            0x78, 0x63, 0x66, 0x20, 0x76,
        ])
        return magic == buf[:len(magic)]
class Avif(IsoBmff):
    """
    Matcher for AVIF images, decided from the brands read out of the
    buffer by the IsoBmff helpers.
    """
    MIME = 'image/avif'
    EXTENSION = 'avif'

    def __init__(self):
        super(Avif, self).__init__(mime=Avif.MIME, extension=Avif.EXTENSION)

    def match(self, buf):
        """
        Return True when the major brand is 'avif', or when it is
        'mif1' or 'msf1' and 'avif' is listed among the compatible
        brands. Buffers rejected by ``_is_isobmff`` never match.
        """
        if not self._is_isobmff(buf):
            return False
        major_brand, _minor_version, compatible_brands = self._get_ftyp(buf)
        if major_brand == 'avif':
            return True
        return (major_brand in ['mif1', 'msf1']
                and 'avif' in compatible_brands)

View file

@ -0,0 +1,33 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import codecs
from .base import Type
class IsoBmff(Type):
    """
    Base type for matchers that inspect an ISO-BMFF 'ftyp' header.
    """

    def __init__(self, mime, extension):
        super(IsoBmff, self).__init__(mime=mime, extension=extension)

    def _is_isobmff(self, buf):
        """
        Return True when ``buf`` holds at least 16 bytes, has 'ftyp'
        at offset 4, and is at least as long as the big-endian length
        stored in its first four bytes.
        """
        if len(buf) < 16:
            return False
        if buf[4:8] != b'ftyp':
            return False
        # First four bytes, read as a big-endian integer via their
        # hexadecimal representation.
        declared_len = int(codecs.encode(buf[0:4], 'hex'), 16)
        return len(buf) >= declared_len

    def _get_ftyp(self, buf):
        """
        Decode the header fields of ``buf``.

        Returns a tuple ``(major_brand, minor_version,
        compatible_brands)``: the text at bytes 8..11, the big-endian
        integer at bytes 12..15, and the list of four-byte texts from
        offset 16 up to the length stored in the first four bytes.
        Undecodable bytes are dropped from the text fields.
        """
        ftyp_len = int(codecs.encode(buf[0:4], 'hex'), 16)
        major_brand = buf[8:12].decode(errors='ignore')
        minor_version = int(codecs.encode(buf[12:16], 'hex'), 16)
        compatible_brands = [
            buf[pos:pos + 4].decode(errors='ignore')
            for pos in range(16, ftyp_len, 4)
        ]
        return major_brand, minor_version, compatible_brands

View file

@ -0,0 +1,223 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from .base import Type
from .isobmff import IsoBmff
class Mp4(IsoBmff):
    """
    Matcher for MP4 video, decided from the brands read out of the
    buffer by the IsoBmff helpers.
    """
    MIME = 'video/mp4'
    EXTENSION = 'mp4'

    def __init__(self):
        super(Mp4, self).__init__(mime=Mp4.MIME, extension=Mp4.EXTENSION)

    def match(self, buf):
        """
        Return True when any compatible brand, or else the major
        brand, is one of 'mp41', 'mp42' or 'isom'. Buffers rejected
        by ``_is_isobmff`` never match.
        """
        if not self._is_isobmff(buf):
            return False
        accepted = ['mp41', 'mp42', 'isom']
        major_brand, _minor_version, compatible_brands = self._get_ftyp(buf)
        if any(brand in accepted for brand in compatible_brands):
            return True
        return major_brand in accepted
class M4v(Type):
    """
    Matcher for M4V video: leading bytes 00 00 00 1C followed by the
    ASCII text 'ftypM4V'.
    """
    MIME = 'video/x-m4v'
    EXTENSION = 'm4v'

    def __init__(self):
        super(M4v, self).__init__(mime=M4v.MIME, extension=M4v.EXTENSION)

    def match(self, buf):
        """Return True for buffers longer than ten bytes that start with the 11-byte header."""
        if len(buf) <= 10:
            return False
        header = (
            0x0, 0x0, 0x0, 0x1C,            # leading four bytes
            0x66, 0x74, 0x79, 0x70,         # 'ftyp'
            0x4D, 0x34, 0x56,               # 'M4V'
        )
        return tuple(buf[:11]) == header
class Mkv(Type):
    """
    Matcher for MKV video: the buffer starts with 1A 45 DF A3 and
    contains the bytes 42 82 88 followed by 'matroska' somewhere.
    """
    MIME = 'video/x-matroska'
    EXTENSION = 'mkv'

    def __init__(self):
        super(Mkv, self).__init__(mime=Mkv.MIME, extension=Mkv.EXTENSION)

    def match(self, buf):
        """Return True when both the leading element and the doctype marker are present."""
        if not buf.startswith(b'\x1A\x45\xDF\xA3'):
            return False
        return b'\x42\x82\x88matroska' in buf
class Webm(Type):
    """
    Matcher for WebM video: the buffer starts with 1A 45 DF A3 and
    contains the bytes 42 82 84 followed by 'webm' somewhere.
    """
    MIME = 'video/webm'
    EXTENSION = 'webm'

    def __init__(self):
        super(Webm, self).__init__(mime=Webm.MIME, extension=Webm.EXTENSION)

    def match(self, buf):
        """Return True when both the leading element and the doctype marker are present."""
        if not buf.startswith(b'\x1A\x45\xDF\xA3'):
            return False
        return b'\x42\x82\x84webm' in buf
class Mov(IsoBmff):
    """
    Implements the MOV video type matcher.

    A buffer matches when it passes ``_is_isobmff`` and its major
    brand is the QuickTime brand.
    """
    MIME = 'video/quicktime'
    EXTENSION = 'mov'

    def __init__(self):
        super(Mov, self).__init__(
            mime=Mov.MIME,
            extension=Mov.EXTENSION
        )

    def match(self, buf):
        """Return True when the major brand read from ``buf`` is the QuickTime brand."""
        if not self._is_isobmff(buf):
            return False
        major_brand, minor_version, compatible_brands = self._get_ftyp(buf)
        # The major brand is decoded from four bytes, so the QuickTime
        # brand must be compared as four characters: 'q', 't' and two
        # spaces. The spaces are written as escapes so that whitespace
        # normalisation cannot shorten the literal; a three-character
        # literal can never equal a four-byte brand.
        return major_brand == 'qt\x20\x20'
class Avi(Type):
    """
    Matcher for AVI video: 'RIFF' at offset 0 and 'AVI ' at offset 8.
    """
    MIME = 'video/x-msvideo'
    EXTENSION = 'avi'

    def __init__(self):
        super(Avi, self).__init__(mime=Avi.MIME, extension=Avi.EXTENSION)

    def match(self, buf):
        """Return True for buffers longer than eleven bytes carrying both tags."""
        if len(buf) <= 11:
            return False
        container_tag = tuple(buf[:4])
        form_tag = tuple(buf[8:12])
        return (container_tag == (0x52, 0x49, 0x46, 0x46) and
                form_tag == (0x41, 0x56, 0x49, 0x20))
class Wmv(Type):
    """
    Matcher for WMV video, recognised by the ten leading bytes
    30 26 B2 75 8E 66 CF 11 A6 D9.
    """
    MIME = 'video/x-ms-wmv'
    EXTENSION = 'wmv'

    def __init__(self):
        super(Wmv, self).__init__(mime=Wmv.MIME, extension=Wmv.EXTENSION)

    def match(self, buf):
        """Return True when ``buf`` has at least ten bytes equal to the signature."""
        header = (0x30, 0x26, 0xB2, 0x75, 0x8E,
                  0x66, 0xCF, 0x11, 0xA6, 0xD9)
        if len(buf) < len(header):
            return False
        return all(buf[pos] == value for pos, value in enumerate(header))
class Flv(Type):
    """
    Matcher for FLV video, recognised by the leading bytes 'FLV'
    followed by 0x01.
    """
    MIME = 'video/x-flv'
    EXTENSION = 'flv'

    def __init__(self):
        super(Flv, self).__init__(mime=Flv.MIME, extension=Flv.EXTENSION)

    def match(self, buf):
        """Return True when ``buf`` has at least four bytes equal to the signature."""
        if len(buf) <= 3:
            return False
        return tuple(buf[:4]) == (0x46, 0x4C, 0x56, 0x01)
class Mpeg(Type):
    """
    Matcher for MPEG video: leading bytes 00 00 01 followed by a
    fourth byte in the range 0xB0..0xBF.
    """
    MIME = 'video/mpeg'
    EXTENSION = 'mpg'

    def __init__(self):
        super(Mpeg, self).__init__(mime=Mpeg.MIME, extension=Mpeg.EXTENSION)

    def match(self, buf):
        """Return True when ``buf`` has at least four bytes forming an accepted start code."""
        if len(buf) <= 3:
            return False
        if tuple(buf[:3]) != (0x0, 0x0, 0x1):
            return False
        return 0xb0 <= buf[3] <= 0xbf
class M3gp(Type):
    """
    Implements the 3gp video type matcher.

    The signature is the ASCII text 'ftyp3gp' located at byte
    offset 4, i.e. right after the four leading size bytes.
    """
    MIME = 'video/3gpp'
    EXTENSION = '3gp'

    def __init__(self):
        super(M3gp, self).__init__(
            mime=M3gp.MIME,
            extension=M3gp.EXTENSION
        )

    def match(self, buf):
        """
        Return True when bytes 4..10 of ``buf`` spell 'ftyp3gp'.

        Buffers shorter than 11 bytes yield a shorter slice and
        therefore never match.
        """
        # The 'ftyp' tag follows the four size bytes, so the signature
        # starts at offset 4. Comparing from offset 0 could never match
        # a buffer that begins with that size field.
        return buf[4:11] == bytearray([0x66, 0x74, 0x79, 0x70,
                                       0x33, 0x67, 0x70])

84
libs/filetype/utils.py Normal file
View file

@ -0,0 +1,84 @@
# -*- coding: utf-8 -*-
# Python 2.7 workaround
try:
import pathlib
except ImportError:
pass
# Maximum number of leading bytes taken from an input as its
# header signature (used as the read size and the slice bound below).
_NUM_SIGNATURE_BYTES = 8192
def get_signature_bytes(path):
    """
    Open the file at ``path`` in binary mode and read its header.

    Args:
        path: path to the file to read.

    Returns:
        bytearray holding at most the first ``_NUM_SIGNATURE_BYTES``
        (8192) bytes of the file.
    """
    with open(path, 'rb') as handle:
        head = handle.read(_NUM_SIGNATURE_BYTES)
    return bytearray(head)
def signature(array):
    """
    Cut ``array`` down to its header signature.

    Args:
        array: sliceable sequence of bytes (e.g. a bytearray).

    Returns:
        Slice of ``array`` holding at most its first
        ``_NUM_SIGNATURE_BYTES`` (8192) items; shorter inputs are
        returned in full.
    """
    stop = min(len(array), _NUM_SIGNATURE_BYTES)
    return array[:stop]
def get_bytes(obj):
    """
    Work out what kind of input ``obj` is and return its first
    ``_NUM_SIGNATURE_BYTES`` (8192) bytes.

    Args:
        obj: path string or pathlib path, file-like object (with a
            read() method), bytes, bytearray or memoryview.

    Returns:
        The leading bytes of the content: a slice of the input for
        bytearray and bytes objects, a new bytearray for memoryview
        objects and for files read from a path.

    Raises:
        TypeError: if obj is not a supported type.
    """
    if isinstance(obj, bytearray):
        return signature(obj)

    # Strings are treated as filesystem paths.
    if isinstance(obj, str):
        return get_signature_bytes(obj)

    if isinstance(obj, bytes):
        return signature(obj)

    if isinstance(obj, memoryview):
        return bytearray(signature(obj).tolist())

    if isinstance(obj, pathlib.PurePath):
        return get_signature_bytes(obj)

    if not hasattr(obj, 'read'):
        raise TypeError('Unsupported type as file input: %s' % type(obj))

    seekable = hasattr(obj, 'tell') and hasattr(obj, 'seek')
    if not seekable:
        # Plain readers are consumed from their current position.
        return get_bytes(obj.read(_NUM_SIGNATURE_BYTES))

    # Seekable readers are read from the very beginning and then put
    # back at the position they had on entry.
    origin = obj.tell()
    obj.seek(0)
    head = obj.read(_NUM_SIGNATURE_BYTES)
    obj.seek(origin)
    return get_bytes(head)

View file

@ -60,6 +60,9 @@ tzlocal==5.2
# Required-by: beautifulsoup4
soupsieve==2.3.2.post1
# Required-by: deathbycaptcha
filetype==1.2.0
# Required-by: ffsubsync
auditok<=0.1.5 # do not upgrade unless ffsubsync requirements.txt change
ffmpeg-python==0.2.0