with builtins;
# Tokenizer.
let
layout_pat = "[ \n]+";
layout_pat_opt = "[ \n]*";
token_pat = ''=|[[][[][a-zA-Z0-9_."*-]+[]][]]|[[][a-zA-Z0-9_."*-]+[]]|[a-zA-Z0-9_-]+|"[^"]*"''; #"
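# Token classes recognized by token_pat, in order: the "=" sign, array-of-table
# headers such as [[pkg."rust-std".target]], table headers such as [pkg.target],
# bare keys and booleans such as version or true, and basic strings such as
# "1.20.0" (escape sequences are not handled).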
tokenizer_1_11 = str:
let
tokenizer_rec = len: prevTokens: patterns: str:
let
pattern = head patterns;
layoutAndTokens = match pattern str;
matchLength = stringLength (head layoutAndTokens);
tokens = prevTokens ++ tail layoutAndTokens;
in
if layoutAndTokens == null then
# If no pattern matches any longer, return the list of tokens collected so far.
if tail patterns == [] then prevTokens
# Otherwise, fall back to the next pattern, which captures only half as many tokens.
else tokenizer_rec len prevTokens (tail patterns) str
else tokenizer_rec len tokens patterns (substring matchLength len str);
avgTokenSize = 100;
ceilLog2 = v:
let inner = n: i: if i < v then inner (n + 1) (i * 2) else n; in
inner 1 1;
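# ceilLog2 returns one more than the number of doublings needed for 1 to reach at
# least v (roughly ceil(log2 v) + 1); it bounds the depth of the patterns below.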
# The builtins.match function matches the entire string and generates a list of all captured
# elements. This is the most efficient way to build a tokenizer, provided we can write a pattern
# which captures every token of the file. Unfortunately, C++ std::regex does not support captures
# in repeated patterns. As a work-around, we generate patterns which match tokens in powers of 2,
# so that we avoid iterating too many times over the content.
generatePatterns = str:
let
depth = ceilLog2 (stringLength str / avgTokenSize);
inner = depth:
if depth == 0 then [ "(${token_pat})" ]
else
let next = inner (depth - 1); in
[ "${head next}${layout_pat}${head next}" ] ++ next;
in
map (pat: "(${layout_pat_opt}${pat}).*" ) (inner depth);
in
tokenizer_rec (stringLength str) [] (generatePatterns str) str;
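# Sketch of the strategy above: generatePatterns returns one pattern per power of
# two, matching 2^depth, ..., 4, 2 and finally 1 token per call; tokenizer_rec keeps
# applying the largest pattern while it matches, then falls back to the shorter
# ones to consume the remaining tokens at the end of the file.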
tokenizer_1_12 = str:
let
# Nix 1.12 has the builtins.split function, which allows tokenizing the
# file quickly by iterating with a simple regexp.
layoutTokenList = split "(${token_pat})" str;
isLayout = s: match layout_pat_opt s != null;
filterLayout = list:
filter (s:
if isString s then
if isLayout s then false
else throw "Error: Unexpected token: '${s}'"
else true) list;
removeTokenWrapper = list:
map (x: assert tail x == []; head x) list;
in
removeTokenWrapper (filterLayout layoutTokenList);
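# Hypothetical example, using the split-based tokenizer:
#   tokenizer ''
#     [pkg]
#     version = "1.20.0"
#   ''
# evaluates to [ "[pkg]" "version" "=" "\"1.20.0\"" ].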
tokenizer =
if builtins ? split
then tokenizer_1_12
else tokenizer_1_11;
in
# Parse entry headers
let
unescapeString = str:
# Let's ignore any escape character for the moment.
assert match ''"[^"]*"'' str != null; #"
substring 1 (stringLength str - 2) str;
# Match a single identifier (bare or quoted) inside a TOML section header.
ident_pat = ''[a-zA-Z0-9_-]+|"[^"]*"''; #"
removeBraces = token: wrapLen:
substring wrapLen (stringLength token - 2 * wrapLen) token;
# Note: this implementation is limited to headers of at most 11 identifiers.
matchPathFun_1_11 = token:
let
# match header_pat "a.b.c" == [ "a" ".b.c" "b" ".c" "c" ... ] (unmatched groups omitted)
header_pat =
foldl' (pat: n: "(${ident_pat})([.]${pat})?")
"(${ident_pat})" (genList (n: 0) 10);
matchPath = match header_pat token;
filterDot = filter (s: substring 0 1 s != ".") matchPath;
in
filterDot;
matchPathFun_1_12 = token:
map (e: head e)
(filter (s: isList s)
(split "(${ident_pat})" token));
matchPathFun =
if builtins ? split
then matchPathFun_1_12
else matchPathFun_1_11;
headerToPath = token: wrapLen:
let
token' = removeBraces token wrapLen;
matchPath = matchPathFun token';
path =
map (s:
if substring 0 1 s != ''"'' then s #"
else unescapeString s
) matchPath;
in
assert matchPath != null;
# assert trace "Path: ${token'}; match as ${toString path}" true;
path;
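# Hypothetical example (with the split-based matchPathFun):
#   headerToPath ''[pkg."rust-std"]'' 1 evaluates to [ "pkg" "rust-std" ],
# and wrapLen is 2 when the header comes from a [[...]] token.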
in
# Reconstruct the equivalent attribute set.
let
tokenToValue = token:
if token == "true" then true
else if token == "false" then false
else unescapeString token;
parserInitState = {
idx = 0;
path = [];
isList = false;
output = [];
elem = {};
};
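# Parser state: `idx` tracks the position inside a `key = value` binding (0 = key,
# 1 = "=", 2 = value), `path` is the attribute path of the current section, `isList`
# records whether it came from a [[...]] header, `elem` accumulates its key/value
# pairs, and `output` collects the sections closed so far.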
# Imported from nixpkgs library.
setAttrByPath = attrPath: value:
if attrPath == [] then value
else listToAttrs
[ { name = head attrPath; value = setAttrByPath (tail attrPath) value; } ];
closeSection = state:
state // {
output = state.output ++ [ (setAttrByPath state.path (
if state.isList then [ state.elem ]
else state.elem
)) ];
};
readToken = state: token:
# assert trace "Read '${token}'" true;
if state.idx == 0 then
if substring 0 2 token == "[[" then
(closeSection state) // {
path = headerToPath token 2;
isList = true;
elem = {};
}
else if substring 0 1 token == "[" then
(closeSection state) // {
path = headerToPath token 1;
isList = false;
elem = {};
}
else
assert match "[a-zA-Z0-9_-]+" token != null;
state // { idx = 1; name = token; }
else if state.idx == 1 then
assert token == "=";
state // { idx = 2; }
else
assert state.idx == 2;
state // {
idx = 0;
elem = state.elem // {
"${state.name}" = tokenToValue token;
};
};
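# Illustrative trace: folding readToken over [ "[pkg]" "name" "=" "\"rustc\"" ]
# first closes the (empty) implicit top-level section, then sets path = [ "pkg" ],
# and finishes with elem = { name = "rustc"; }, which the final closeSection in
# `parser` appends to `output`.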
# Aggregate each section into an individual attribute set.
parser = str:
closeSection (foldl' readToken parserInitState (tokenizer str));
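# For the trace above, (parser toml).output is [ { } { pkg = { name = "rustc"; }; } ],
# and fromTOML below merges that list of sections into a single attribute set.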
fromTOML = toml:
let
sections = (parser toml).output;
# Inlined from nixpkgs library functions.
zipAttrs = sets:
listToAttrs (map (n: {
name = n;
value =
let v = catAttrs n sets; in
# assert trace "Visiting ${n}" true;
if tail v == [] then head v
else if isList (head v) then concatLists v
else if isAttrs (head v) then zipAttrs v
else throw "cannot merge sections";
}) (concatLists (map attrNames sets)));
in
zipAttrs sections;
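# Putting it together on a hypothetical document:
#   fromTOML ''
#     [pkg]
#     name = "rustc"
#     [[bin]]
#     name = "rustc-main"
#   ''
# evaluates to { bin = [ { name = "rustc-main"; } ]; pkg = { name = "rustc"; }; }.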
in
{
testing = fromTOML (builtins.readFile ./channel-rust-nightly.toml);
testing_url = fromTOML (builtins.readFile (builtins.fetchurl
https://static.rust-lang.org/dist/channel-rust-nightly.toml));
inherit fromTOML;
}