2017-04-08 02:43:18 +02:00
|
|
|
# Package expression for the Tesseract OCR engine, bundled with trained
# language data taken from the separate `tessdata' repository.
#
# Arguments:
#   stdenv, fetchFromGitHub, autoreconfHook, pkgconfig -- build tooling
#   leptonica, libpng, libtiff, icu, pango, opencl-headers -- build inputs
#   enableLanguages -- list of language codes (e.g. [ "eng" "deu" ]) whose
#     `<lang>.traineddata' files get installed, or `null' to install all of them
{ stdenv, fetchFromGitHub, autoreconfHook, pkgconfig
, leptonica, libpng, libtiff, icu, pango, opencl-headers
# Supported list of languages or `null' for all available languages
, enableLanguages ? null
}:

stdenv.mkDerivation rec {
  name = "tesseract-${version}";
  version = "3.05.00";

  # Engine sources; the git tag is the bare version string.
  src = fetchFromGitHub {
    owner = "tesseract-ocr";
    repo = "tesseract";
    rev = version;
    sha256 = "11wrpcfl118wxsv2c3w2scznwb48c4547qml42s2bpdz079g8y30";
  };

  # Trained language data, pinned to a fixed commit. Being a derivation
  # attribute, it is visible to the builder as `$tessdata' (used in postInstall).
  tessdata = fetchFromGitHub {
    owner = "tesseract-ocr";
    repo = "tessdata";
    rev = "3cf1e2df1fe1d1da29295c9ef0983796c7958b7d";
    sha256 = "1v4b63v5nzcxr2y3635r19l7lj5smjmc9vfk0wmxlryxncb4vpg7";
  };

  nativeBuildInputs = [ pkgconfig autoreconfHook ];
  buildInputs = [ leptonica libpng libtiff icu pango opencl-headers ];

  # Exported to the build environment; presumably consumed by tesseract's
  # configure script to locate the leptonica headers -- TODO confirm.
  LIBLEPT_HEADERSDIR = "${leptonica}/include";

  # Copy the .traineddata files of the languages specified in enableLanguages
  # into `$out/share/tessdata' and check afterwards that the number of files
  # handed to `cp' matches the number of languages requested.
  postInstall = let
    # De-duplicate the request: every language maps to a single file, so a
    # repeated entry would raise `expected' above what find can ever match
    # and fail the build for no good reason.
    langs = if enableLanguages != null
            then stdenv.lib.unique enableLanguages
            else null;

    # One find(1) test per language, with the file name shell-escaped.
    mkArg = lang: "-iname ${stdenv.lib.escapeShellArg "${lang}.traineddata"}";
    mkFindArgs = stdenv.lib.concatMapStringsSep " -o " mkArg;

    findLangArgs =
      if langs == null then "-iname '*.traineddata'"
      # An empty list would otherwise produce `\( \)', which find rejects
      # (empty parentheses); `-false' selects nothing without an error.
      else if langs == [ ] then "-false"
      else "\\( ${mkFindArgs langs} \\)";
  in ''
    numLangs="$(find "$tessdata" -mindepth 1 -maxdepth 1 -type f \
      ${findLangArgs} -exec cp -t "$out/share/tessdata" {} + -print | wc -l)"

    ${if langs != null then ''
      expected=${toString (builtins.length langs)}
    '' else ''
      expected="$(ls -1 "$tessdata/"*.traineddata | wc -l)"
    ''}

    if [ "$numLangs" -ne "$expected" ]; then
      echo "Expected $expected languages, but $numLangs" \
           "were copied to \`$out/share/tessdata'" >&2
      exit 1
    fi
  '';

  meta = {
    description = "OCR engine";
    # Quoted string instead of a bare URL literal (deprecated Nix syntax).
    homepage = "https://github.com/tesseract-ocr/tesseract";
    license = stdenv.lib.licenses.asl20;
    maintainers = with stdenv.lib.maintainers; [ viric ];
    platforms = with stdenv.lib.platforms; linux ++ darwin;
  };
}
|