# Core package metadata (PEP 621). The version is not written here: it is
# declared `dynamic` and must be supplied by the build backend.
[project]
name = "llm-data-hub"
dynamic = ["version"]
description = "This repository centralizes and organizes dataset management for large language model training."
readme = "README.md"
requires-python = ">=3.9"
authors = []
maintainers = []
# "Private :: Do Not Upload" makes PyPI reject accidental uploads.
# NOTE(review): `requires-python` admits 3.9, but 3.10 is the only specific
# version advertised below. Confirm which of the two is intended.
classifiers = [
    "Intended Audience :: Developers",
    "Intended Audience :: Science/Research",
    "Operating System :: OS Independent",
    "Private :: Do Not Upload",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.10",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Typing :: Typed",
]
# Runtime requirements installed with the base package (all unpinned).
dependencies = [
    "configue",
    "datasets",
    "fire",
    "gcsfs",
    "nltk",
    "pandas",
    "rich",
    "tabulate",
    "transformers[sentencepiece]",
    "zstandard",
]

# Optional extras, installed with e.g. `pip install ".[dev,evaluation]"`.
[project.optional-dependencies]
audio = [
    "openai-whisper",
]
# NOTE(review): "configue" and "zstandard" are also listed in the core
# `dependencies`, so repeating them here looks redundant. Confirm before removing.
construction = [
    "apache-beam",
    "configue",
    "zstandard",
]
# Formatting, linting, type-checking and test tooling.
dev = [
    "black>=22.0.0",
    "coverage>=7.0.0",
    "mypy>=1.0.0",
    "pytest>=7.0.0",
    "ruff>=0.0.257",
]
evaluation = [
    "rouge-score",
    "sacrebleu",
]
perplexity = [
    # kenlm is currently disabled. If it is re-enabled, a bare URL is not a
    # valid PEP 508 requirement; it has to be a direct reference of the form
    # "kenlm @ <url>".
    # "git+https://github.com/kpu/kenlm/archive/master.zip",
    "sentencepiece",
]

# Links shown in the package metadata. Values must be absolute URLs,
# including the scheme.
[project.urls]
# NOTE(review): this points at the GitHub front page rather than a specific
# repository; presumably a placeholder to be replaced with the project URL.
homepage = "https://github.com"

# PEP 517/518 build configuration.
[build-system]
# setuptools only reads the [project] table from 61.0.0 onwards; without a
# floor, an older setuptools would silently ignore the metadata in this file.
# setuptools_scm supplies the dynamic version.
requires = ["setuptools>=61", "setuptools_scm[toml]", "wheel"]
build-backend = "setuptools.build_meta"

# The presence of this table enables setuptools_scm for the build.
[tool.setuptools_scm]
# Version reported when none can be derived from SCM metadata.
fallback_version = "0.0.0-dev"

# setuptools-specific packaging options.
[tool.setuptools]
platforms = ["any"]
zip-safe = false

# Package discovery: only the top-level packages named here, plus their
# sub-packages (the ".*" patterns), are included in the distribution.
[tool.setuptools.packages.find]
include = [
    "dataset_collection",
    "dataset_collection.*",
    "dataset_construction",
    "dataset_construction.*",
    "dataset_preprocessing",
    "dataset_preprocessing.*",
    "evaluation",
    "evaluation.*",
    "utils",
    "utils.*",
]

# Global mypy settings. Per-module relaxations live in the
# [[tool.mypy.overrides]] entries.
[tool.mypy]
# Where to look: skip docs/, and search the local `typings` directory located
# next to this config file.
exclude = ["docs/"]
mypy_path = "$MYPY_CONFIG_FILE_DIR/typings"
# Strictness: every function must be annotated, and bodies are checked even
# when they are not.
check_untyped_defs = true
disallow_untyped_defs = true
no_implicit_optional = true
# Every `# type: ignore` must name the error code it suppresses.
enable_error_code = ["ignore-without-code"]
show_error_codes = true
# Extra warnings.
warn_redundant_casts = true
warn_return_any = true
warn_unreachable = true
warn_unused_configs = true
warn_unused_ignores = true

# Third-party modules for which missing stubs / type information are
# tolerated instead of reported as import errors.
# NOTE(review): backoff, jsonlines and tqdm are not declared in
# [project].dependencies. Confirm they arrive transitively or declare them.
[[tool.mypy.overrides]]
ignore_missing_imports = true
module = [
    "backoff",
    "configue",
    "datasets",
    "datasets.*",
    "fire",
    "jsonlines",
    "nltk.*",
    "pandas",
    "rouge_score",
    "tqdm.*",
    "transformers",
    "transformers.*",
]

# Files whose execution is measured by coverage.py.
# NOTE(review): the packaged directories dataset_collection/,
# dataset_preprocessing/ and dataset_construction/ are not listed, while
# tasks/ and scripts/ are. Confirm this selection is intentional.
[tool.coverage.run]
include = [
    "tasks/*",
    "utils/*",
    "scripts/*",
    "evaluation/*",
]

# Lines matching any of these regular expressions are left out of the
# coverage report. Literal (single-quoted) strings keep the patterns free of
# TOML escaping.
[tool.coverage.report]
exclude_lines = [
    'pragma: no cover',
    'raise NotImplementedError',
    'if __name__ == "__main__":',
    'if TYPE_CHECKING:',
    'def __repr__',
]

# Black formatter settings.
[tool.black]
# Same limit as ruff's `line-length`, so formatter and linter agree.
line-length = 120

# Ruff linter settings. `select` stays at the top level because the declared
# minimum (ruff>=0.0.257) predates the `[tool.ruff.lint]` table.
[tool.ruff]
line-length = 120
# E/W: pycodestyle, F: pyflakes, I: isort, N: pep8-naming.
select = ["E", "F", "W", "I", "N"]

[tool.ruff.per-file-ignores]
# Package __init__ files may import names without using them (F401).
"__init__.py" = ["F401"]
