tree-sitter: improve update script to fetch all available grammars
Previously, the update script would only fetch the few grammars listed in the tree-sitter repository. But the tree-sitter GitHub organization has a rather large number of officially supported grammars. Thus we change the script to query the GitHub API for the organization’s repositories instead (up to 100 repositories can be fetched in one request, without paging). Since the repository list also contains some repositories that are not grammars, there is a bash script which lists all repositories we are aware of and the ones we want to ignore. By comparing these lists to the actual list with jq, it makes sure we don’t forget any repositories in the future.
This commit is contained in:
parent
a629257ec5
commit
c0a4b41afe
1 changed files with 86 additions and 8 deletions
|
@ -3,11 +3,70 @@
|
|||
, src }:
|
||||
|
||||
let
|
||||
# print all the grammar names mentioned in the fetch-fixtures script
|
||||
getGrammarNames = writeShellScript "get-grammars.sh" ''
|
||||
# check that every repository in the given list (a JSON array on stdin) is either a known grammar or explicitly ignored.
|
||||
checkKnownGrammars = writeShellScript "get-grammars.sh" ''
|
||||
set -euo pipefail
|
||||
sed -ne 's/^fetch_grammar \(\S*\).*$/\1/p' \
|
||||
${src}/script/fetch-fixtures
|
||||
known='
|
||||
[ "tree-sitter-javascript"
|
||||
, "tree-sitter-c"
|
||||
, "tree-sitter-swift"
|
||||
, "tree-sitter-json"
|
||||
, "tree-sitter-cpp"
|
||||
, "tree-sitter-ruby"
|
||||
, "tree-sitter-razor"
|
||||
, "tree-sitter-go"
|
||||
, "tree-sitter-c-sharp"
|
||||
, "tree-sitter-python"
|
||||
, "tree-sitter-typescript"
|
||||
, "tree-sitter-rust"
|
||||
, "tree-sitter-bash"
|
||||
, "tree-sitter-php"
|
||||
, "tree-sitter-java"
|
||||
, "tree-sitter-scala"
|
||||
, "tree-sitter-ocaml"
|
||||
, "tree-sitter-julia"
|
||||
, "tree-sitter-agda"
|
||||
, "tree-sitter-fluent"
|
||||
, "tree-sitter-html"
|
||||
, "tree-sitter-haskell"
|
||||
, "tree-sitter-regex"
|
||||
, "tree-sitter-css"
|
||||
, "tree-sitter-verilog"
|
||||
, "tree-sitter-jsdoc"
|
||||
, "tree-sitter-ql"
|
||||
]'
|
||||
ignore='
|
||||
[ "tree-sitter"
|
||||
, "tree-sitter-cli"
|
||||
, "tree-sitter-embedded-template"
|
||||
${/*this is the haskell language bindings, tree-sitter-haskell is the grammar*/""}
|
||||
, "haskell-tree-sitter"
|
||||
${/*this is the ruby language bindings, tree-sitter-ruby is the grammar*/""}
|
||||
, "ruby-tree-sitter"
|
||||
${/*this is the (unmaintained) rust language bindings, tree-sitter-rust is the grammar*/""}
|
||||
, "rust-tree-sitter"
|
||||
${/*this is the nodejs language bindings, tree-sitter-javascript is the grammar*/""}
|
||||
, "node-tree-sitter"
|
||||
${/*this is the python language bindings, tree-sitter-python is the grammar*/""}
|
||||
, "py-tree-sitter"
|
||||
${/*afl fuzzing for tree sitter*/""}
|
||||
, "afl-tree-sitter"
|
||||
${/*archived*/""}
|
||||
, "highlight-schema"
|
||||
${/*website*/""}
|
||||
, "tree-sitter.github.io"
|
||||
]'
|
||||
res=$(${jq}/bin/jq \
|
||||
--argjson known "$known" \
|
||||
--argjson ignore "$ignore" \
|
||||
'. - ($known + $ignore)' \
|
||||
)
|
||||
if [ ! "$res" == "[]" ]; then
|
||||
echo "These repositories are neither known nor ignored:" 1>&2
|
||||
echo "$res" 1>&2
|
||||
exit 1
|
||||
fi
|
||||
printf '%s' "$known"
|
||||
'';
|
||||
|
||||
# TODO
|
||||
|
@ -22,7 +81,7 @@ let
|
|||
res=$(${curl}/bin/curl \
|
||||
--silent \
|
||||
"https://api.github.com/repos/${urlEscape owner}/$(${urlEscapeSh} "$repo")/releases/latest")
|
||||
if [[ "$(printf "%s" "$res" | ${jq}/bin/jq '.message')" =~ "rate limit" ]]; then
|
||||
if [[ "$(printf "%s" "$res" | ${jq}/bin/jq '.message?')" =~ "rate limit" ]]; then
|
||||
echo "rate limited" >&2
|
||||
fi
|
||||
release=$(printf "%s" "$res" | ${jq}/bin/jq '.tag_name')
|
||||
|
@ -34,6 +93,21 @@ let
|
|||
echo "$release"
|
||||
'';
|
||||
|
||||
# List the repositories of a GitHub organization.
#
# Builds a shell script ("latest-github-repos") that requests
# https://api.github.com/orgs/<orga>/repos?per_page=100 with curl and prints
# a JSON array containing the `name` field of every returned entry to stdout.
#
# Argument:
#   orga - name of the GitHub organization; it is interpolated into the URL
#          as-is (no URL escaping is applied here).
#
# Notes:
#   * Only a single request is made, so only the first page of results
#     (per_page=100) is seen; there is no paging.
#   * `.message?` uses jq's optional operator so that the rate-limit check
#     does not raise a jq error when the response is a JSON array rather
#     than an object with a `message` field.
#   * If the response mentions "rate limit", "rate limited" is printed to
#     stderr, but the script does not stop at that point.
#   * If the final jq call fails, "failed <response>" is printed to stdout
#     via `echo`.
#     NOTE(review): that fallback makes the script end with echo's exit
#     status, so callers may not notice the failure — confirm this is intended.
|
||||
latestGithubRepos = { orga }: writeShellScript "latest-github-repos" ''
|
||||
set -euo pipefail
|
||||
res=$(${curl}/bin/curl \
|
||||
--silent \
|
||||
'https://api.github.com/orgs/${orga}/repos?per_page=100')
|
||||
|
||||
if [[ "$(printf "%s" "$res" | ${jq}/bin/jq '.message?')" =~ "rate limit" ]]; then
|
||||
echo "rate limited" >&2
|
||||
fi
|
||||
|
||||
printf "%s" "$res" | ${jq}/bin/jq 'map(.name)' \
|
||||
|| echo "failed $res"
|
||||
'';
|
||||
|
||||
# update one tree-sitter grammar repo and print its nix-prefetch-git output
|
||||
updateGrammar = { owner }: writeShellScript "update-grammar.sh" ''
|
||||
set -euo pipefail
|
||||
|
@ -49,18 +123,22 @@ let
|
|||
|
||||
update-all-grammars = writeShellScript "update-all-grammars.sh" ''
|
||||
set -euo pipefail
|
||||
grammarNames=$(${getGrammarNames})
|
||||
echo "fetching list of grammars" 1>&2
|
||||
grammars=$(${latestGithubRepos { orga = "tree-sitter"; }})
|
||||
echo "checking against the list of grammars we know" 1>&2
|
||||
knownGrammars=$(printf '%s' "$grammars" | ${checkKnownGrammars})
|
||||
# change the json list into a item-per-line bash format
|
||||
grammarNames=$(printf '%s' "$knownGrammars" | ${jq}/bin/jq --raw-output '.[]')
|
||||
outputDir="${toString ./.}/grammars"
|
||||
mkdir -p "$outputDir"
|
||||
updateCommand=$(printf \
|
||||
'${updateGrammar { owner = "tree-sitter"; }} "$1" > "%s/$1.json"' \
|
||||
"$outputDir")
|
||||
printf '%s' "$grammarNames" \
|
||||
| ${xe}/bin/xe printf "tree-sitter-%s\n" {} \
|
||||
| ${xe}/bin/xe -j2 -s "$updateCommand"
|
||||
( echo "{"
|
||||
printf '%s' "$grammarNames" \
|
||||
| ${xe}/bin/xe -s 'printf " %s = (builtins.fromJSON (builtins.readFile ./tree-sitter-%s.json));\n" "$1" "$1"'
|
||||
| ${xe}/bin/xe -s 'printf " %s = (builtins.fromJSON (builtins.readFile ./%s.json));\n" "$1" "$1"'
|
||||
echo "}" ) \
|
||||
> "$outputDir/default.nix"
|
||||
'';
|
||||
|
|
Loading…
Reference in a new issue