with builtins;
let
  layout_pat = "[ \n]+";
  layout_pat_opt = "[ \n]*";
  token_pat = ''=|[[][[][a-zA-Z0-9_."*-]+[]][]]|[[][a-zA-Z0-9_."*-]+[]]|[a-zA-Z0-9_-]+|"[^"]*"'';
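
  # For illustration (a rough reading of the pattern above, untested): the
  # alternatives of token_pat match, in order, the "=" sign, [[array-of-table]]
  # headers such as "[[bin]]", [table] headers such as "[package]", bare keys
  # such as "name", and quoted strings such as ''"1.24.0"''.
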
  tokenizer_rec = len: prevTokens: patterns: str:
    let
      pattern = head patterns;
      layoutAndTokens = match pattern str;
      matchLength = stringLength (head layoutAndTokens);
      tokens = prevTokens ++ tail layoutAndTokens;
    in
      if layoutAndTokens == null then
        # If the pattern did not match and no shorter pattern is left, return
        # the list of tokens collected so far.
        if tail patterns == [] then prevTokens
        # Otherwise, retry with the next pattern, which captures half as many tokens.
        else tokenizer_rec len prevTokens (tail patterns) str
      else tokenizer_rec len tokens patterns (substring matchLength len str);

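  # For illustration (untested walk-through): with patterns capturing 2 tokens
  # and then 1 token, scanning 'a = "b"' first consumes 'a =' with the 2-token
  # pattern, then falls back to the 1-token pattern for '"b"', and finally
  # returns [ "a" "=" ''"b"'' ] once neither pattern matches the remaining text.
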
  avgTokenSize = 100;
  ceilLog2 = v:
    let inner = n: i: if i < v then inner (n + 1) (i * 2) else n; in
    inner 1 1;

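  # Note that this is a rough upper bound rather than an exact ceil(log2 v);
  # for example, ceilLog2 5 evaluates to 4. That is good enough here, since it
  # is only used to decide how many patterns to generate.
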
  # The builtins.match function matches the entire string and generates a list
  # of all captured elements. This is the most efficient way to make a
  # tokenizer, if we can make a pattern which captures all tokens of the file.
  # Unfortunately C++ std::regex does not support captures in repeated
  # patterns. As a work-around, we generate patterns which match tokens in
  # multiples of 2, such that we can avoid iterating too many times over the
  # content.
  generatePatterns = str:
    let
      depth = ceilLog2 (stringLength str / avgTokenSize);
      inner = depth:
        if depth == 0 then [ "(${token_pat})" ]
        else
          let next = inner (depth - 1); in
          [ "${head next}${layout_pat}${head next}" ] ++ next;
    in
      map (pat: "(${layout_pat_opt}${pat}).*" ) (inner depth);

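  # For illustration (untested), with depth 2 this yields three patterns of the
  # shape "(L?(T) L (T) L (T) L (T)).*", "(L?(T) L (T)).*" and "(L?(T)).*",
  # where T stands for token_pat and L for layout_pat: the first capture group
  # is the consumed prefix (used to advance through the string) and the
  # remaining groups are the individual tokens.
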
  tokenizer = str: tokenizer_rec (stringLength str) [] (generatePatterns str) str;

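  # For illustration (untested sketch): on an input such as
  #   [package]
  #   name = "demo"
  # the tokenizer is expected to return [ "[package]" "name" "=" ''"demo"'' ].
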
  unescapeString = str:
    # Let's ignore any escape characters for the moment.
    assert match ''"[^"]*"'' str != null;
    substring 1 (stringLength str - 2) str;

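  # For example, unescapeString ''"1.24.0"'' == "1.24.0"; escape sequences are
  # left as-is rather than decoded, as noted above.
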
  tokenToValue = token:
    if token == "true" then true
    else if token == "false" then false
    else unescapeString token;

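  # For illustration: tokenToValue "true" == true and
  # tokenToValue ''"1.24.0"'' == "1.24.0". Anything else is assumed to be a
  # quoted string; bare numbers, dates and arrays would fail the assertion in
  # unescapeString.
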
  # Match the content of TOML section headers, with capture groups added such that:
  # match header_pat "a.b.c" == [ "a" ".b" "b" ".c" "c" ]
  #
  # Note: this implementation is limited to 11 identifiers.
  ident_pat = ''[a-zA-Z0-9_-]+|"[^"]*"'';
  header_pat =
    foldl' (pat: n: "(${ident_pat})([.]${pat})?")
      "(${ident_pat})" (genList (n: 0) 10);

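  # For illustration (untested), after two folds the pattern looks like
  # "(I)([.](I)([.](I))?)?" with I standing for ident_pat, so each additional
  # fold allows one more dotted component; the groups starting with "." are
  # filtered out again in headerToPath below.
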
  headerToPath = token: wrapLen:
    let
      token' = substring wrapLen (stringLength token - 2 * wrapLen) token;
      matchPath = match header_pat token';
      filterDot = filter (s: substring 0 1 s != ".") matchPath;
      path =
        map (s:
          if substring 0 1 s != ''"'' then s
          else unescapeString s
        ) filterDot;
    in
      assert matchPath != null;
      # assert trace "Path: ${token'}; match as ${toString path}" true;
      path;

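  # For illustration (untested sketch): headerToPath "[dependencies.libc]" 1 is
  # expected to produce the path [ "dependencies" "libc" ]; a [[...]] header is
  # passed with wrapLen = 2 to strip the double brackets.
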
  parserInitState = {
    idx = 0;        # 0: expect a key or a section header; 1: expect "="; 2: expect a value.
    path = [];      # attribute path of the section currently being parsed.
    isList = false; # whether the current section was opened by a [[...]] header.
    output = [];    # completed sections, merged by fromTOML at the end.
    elem = {};      # key/value pairs collected for the current section.
  };

  # Imported from nixpkgs library.
  setAttrByPath = attrPath: value:
    if attrPath == [] then value
    else listToAttrs
      [ { name = head attrPath; value = setAttrByPath (tail attrPath) value; } ];

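  # For example, setAttrByPath [ "a" "b" ] true == { a = { b = true; }; }.
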
  closeSection = state:
    state // {
      output = state.output ++ [ (setAttrByPath state.path (
        if state.isList then [ state.elem ]
        else state.elem
      )) ];
    };

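  # For illustration (untested): with path = [ "pkg" "rust" ] and
  # elem = { version = "1.24.0"; }, closeSection appends
  # { pkg = { rust = { version = "1.24.0"; }; }; } to the output, wrapping elem
  # in a singleton list when the section came from a [[...]] header.
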
  readToken = state: token:
    # assert trace "Read '${token}'" true;
    if state.idx == 0 then
      if substring 0 2 token == "[[" then
        (closeSection state) // {
          path = headerToPath token 2;
          isList = true;
          elem = {};
        }
      else if substring 0 1 token == "[" then
        (closeSection state) // {
          path = headerToPath token 1;
          isList = false;
          elem = {};
        }
      else
        assert match "[a-zA-Z0-9_-]+" token != null;
        state // { idx = 1; name = token; }
    else if state.idx == 1 then
      assert token == "=";
      state // { idx = 2; }
    else
      assert state.idx == 2;
      state // {
        idx = 0;
        elem = state.elem // {
          "${state.name}" = tokenToValue token;
        };
      };

  # Aggregate each section into an individual attribute set.
  parser = str:
    closeSection (foldl' readToken parserInitState (tokenizer str));

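  # For illustration (untested sketch): parsing
  #   [package]
  #   name = "demo"
  # folds readToken over [ "[package]" "name" "=" ''"demo"'' ]; the leading
  # "[package]" closes the (empty) initial section, and the final closeSection
  # above records { package = { name = "demo"; }; } in the output list.
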
  fromTOML = toml:
    let
      sections = (parser toml).output;
      # Inlined from nixpkgs library functions.
      zipAttrs = sets:
        listToAttrs (map (n: {
          name = n;
          value =
            let v = catAttrs n sets; in
            # assert trace "Visiting ${n}" true;
            if tail v == [] then head v
            else if isList (head v) then concatLists v
            else if isAttrs (head v) then zipAttrs v
            else throw "cannot merge sections";
        }) (concatLists (map attrNames sets)));
    in
      zipAttrs sections;

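  # For illustration (untested sketch):
  #   fromTOML ''
  #     [package]
  #     name = "demo"
  #
  #     [[bin]]
  #     name = "demo-cli"
  #   ''
  # is expected to evaluate to
  #   { package = { name = "demo"; }; bin = [ { name = "demo-cli"; } ]; }.
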
in

{
  testing = fromTOML (builtins.readFile ./channel-rust-nightly.toml);
  testing_url = fromTOML (builtins.readFile (builtins.fetchurl
    https://static.rust-lang.org/dist/channel-rust-nightly.toml));
  inherit fromTOML;
}