configuration/packages/trafilatura.nix
2023-01-12 14:24:27 -08:00

59 lines
1.7 KiB
Nix

{ lib
, python3
}:
let
# Build an auxiliary Python library from its PyPI source distribution.
#
# Curried: `fetch pname { ... }`. The attribute set must contain exactly
# `version`, `license`, `hash`, `buildInputs` and `propagatedBuildInputs`.
# `pname`, `version` and `hash` select the fetchPypi source; both input
# lists are handed to buildPythonPackage unchanged; `license` ends up in
# the resulting package's `meta.license`.
fetch = pname: { version, license, hash, buildInputs, propagatedBuildInputs }:
  let
    pypiSource = python3.pkgs.fetchPypi { inherit pname version hash; };
  in
  python3.pkgs.buildPythonPackage {
    inherit pname version buildInputs propagatedBuildInputs;
    src = pypiSource;
    meta.license = license;
  };
in
# trafilatura command-line application, built from its PyPI sdist.
#
# courlan, htmldate and jusText are built inline with the `fetch` helper
# from the enclosing `let`; the remaining runtime dependencies come from
# python3.pkgs.
python3.pkgs.buildPythonApplication rec {
  pname = "trafilatura";
  version = "1.3.0";

  # `rec` lets fetchPypi reuse the pname/version declared just above.
  src = python3.pkgs.fetchPypi {
    inherit pname version;
    hash = "sha256-pmGJ5LnVkdzmSPDMeftSpIbmeXCAkBibxPzYgGjwle8=";
  };

  # NOTE(review): pytest / coverage / pytest-cov in the `buildInputs` lists
  # below look like test-only dependencies; consider moving them to the
  # check-input attribute of the pinned nixpkgs (this needs `fetch` to
  # accept it) -- confirm before changing.
  propagatedBuildInputs = with python3.pkgs; [
    certifi
    charset-normalizer
    lxml
    urllib3
    (fetch "courlan" {
      version = "0.8.3";
      hash = "sha256-0GxbBIsrXNXArHcwTcJLeV5Lslentgd+pAWjtema4Xk=";
      license = lib.licenses.gpl3Plus;
      buildInputs = [ pytest ];
      propagatedBuildInputs = [ langcodes tld urllib3 ];
    })
    (fetch "htmldate" {
      version = "1.3.0";
      hash = "sha256-PDLNtpOYJRMUk7ITxWnb5NCF14UilBfpIvkHC2eoDQY=";
      license = lib.licenses.gpl3Plus;
      buildInputs = [ pytest ];
      propagatedBuildInputs = [ charset-normalizer dateparser lxml python-dateutil urllib3 ];
    })
    (fetch "jusText" {
      version = "3.0.0";
      hash = "sha256-dkDiSCGHlfa+ZfbDX+aXMloygPy0Z10VJbzf8rhvqt8=";
      license = lib.licenses.bsd2;
      buildInputs = [ coverage pytest pytest-cov ];
      propagatedBuildInputs = [ lxml ];
    })
  ];

  doCheck = false; # GUI tests fail with no X display

  meta = {
    description = "Python package and command-line tool designed to gather text on the Web, including discovery, extraction and text processing components";
    homepage = "https://github.com/adbar/trafilatura";
    # The bare `gpl3` attribute is a deprecated, ambiguous alias in nixpkgs.
    # Upstream's PyPI classifier for this release is "GPLv3 or later"
    # (TODO confirm), the same licence as the courlan and htmldate
    # entries above.
    license = lib.licenses.gpl3Plus;
  };
}