with builtins;
let
  layout_pat = "[ \n]+";
  layout_pat_opt = "[ \n]*";
  token_pat = ''=|[[][[][a-zA-Z0-9_."*-]+[]][]]|[[][a-zA-Z0-9_."*-]+[]]|[a-zA-Z0-9_-]+|"[^"]*"'';
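
  # For illustration (a rough reading of the pattern above, untested): the
  # alternatives of token_pat match, in order, the "=" sign, [[array-of-table]]
  # headers such as "[[bin]]", [table] headers such as "[package]", bare keys
  # such as "name", and quoted strings such as ''"1.24.0"''.
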
  tokenizer_rec = len: prevTokens: patterns: str:
    let
      pattern = head patterns;
      layoutAndTokens = match pattern str;
      matchLength = stringLength (head layoutAndTokens);
      tokens = prevTokens ++ tail layoutAndTokens;
    in
      if layoutAndTokens == null then
        # If the pattern did not match and no shorter pattern is left, return
        # the list of tokens collected so far.
        if tail patterns == [] then prevTokens
        # Otherwise, retry with the next pattern, which captures half as many tokens.
        else tokenizer_rec len prevTokens (tail patterns) str
      else tokenizer_rec len tokens patterns (substring matchLength len str);

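  # For illustration (untested walk-through): with patterns capturing 2 tokens
  # and then 1 token, scanning 'a = "b"' first consumes 'a =' with the 2-token
  # pattern, then falls back to the 1-token pattern for '"b"', and finally
  # returns [ "a" "=" ''"b"'' ] once neither pattern matches the remaining text.
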
  avgTokenSize = 100;
  ceilLog2 = v:
    let inner = n: i: if i < v then inner (n + 1) (i * 2) else n; in
    inner 1 1;

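  # Note that this is a rough upper bound rather than an exact ceil(log2 v);
  # for example, ceilLog2 5 evaluates to 4. That is good enough here, since it
  # is only used to decide how many patterns to generate.
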
  # The builtins.match function matches the entire string and generates a list
  # of all captured elements. This is the most efficient way to make a
  # tokenizer, if we can make a pattern which captures all tokens of the file.
  # Unfortunately C++ std::regex does not support captures in repeated
  # patterns. As a work-around, we generate patterns which match tokens in
  # multiples of 2, such that we can avoid iterating too many times over the
  # content.
  generatePatterns = str:
    let
      depth = ceilLog2 (stringLength str / avgTokenSize);
      inner = depth:
        if depth == 0 then [ "(${token_pat})" ]
        else
          let next = inner (depth - 1); in
          [ "${head next}${layout_pat}${head next}" ] ++ next;
    in
      map (pat: "(${layout_pat_opt}${pat}).*" ) (inner depth);

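  # For illustration (untested), with depth 2 this yields three patterns of the
  # shape "(L?(T) L (T) L (T) L (T)).*", "(L?(T) L (T)).*" and "(L?(T)).*",
  # where T stands for token_pat and L for layout_pat: the first capture group
  # is the consumed prefix (used to advance through the string) and the
  # remaining groups are the individual tokens.
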
  tokenizer = str: tokenizer_rec (stringLength str) [] (generatePatterns str) str;

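  # For illustration (untested sketch): on an input such as
  #   [package]
  #   name = "demo"
  # the tokenizer is expected to return [ "[package]" "name" "=" ''"demo"'' ].
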
  unescapeString = str:
    # Let's ignore any escape characters for the moment.
    assert match ''"[^"]*"'' str != null;
    substring 1 (stringLength str - 2) str;

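  # For example, unescapeString ''"1.24.0"'' == "1.24.0"; escape sequences are
  # left as-is rather than decoded, as noted above.
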
  tokenToValue = token:
    if token == "true" then true
    else if token == "false" then false
    else unescapeString token;

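  # For illustration: tokenToValue "true" == true and
  # tokenToValue ''"1.24.0"'' == "1.24.0". Anything else is assumed to be a
  # quoted string; bare numbers, dates and arrays would fail the assertion in
  # unescapeString.
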
  # Match the content of TOML section headers, with capture groups added such that:
  # match header_pat "a.b.c" == [ "a" ".b" "b" ".c" "c" ]
  #
  # Note: this implementation is limited to 11 identifiers.
  ident_pat = ''[a-zA-Z0-9_-]+|"[^"]*"'';
  header_pat =
    foldl' (pat: n: "(${ident_pat})([.]${pat})?")
      "(${ident_pat})" (genList (n: 0) 10);

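  # For illustration (untested), after two folds the pattern looks like
  # "(I)([.](I)([.](I))?)?" with I standing for ident_pat, so each additional
  # fold allows one more dotted component; the groups starting with "." are
  # filtered out again in headerToPath below.
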
  headerToPath = token: wrapLen:
    let
      token' = substring wrapLen (stringLength token - 2 * wrapLen) token;
      matchPath = match header_pat token';
      filterDot = filter (s: substring 0 1 s != ".") matchPath;
      path =
        map (s:
          if substring 0 1 s != ''"'' then s
          else unescapeString s
        ) filterDot;
    in
      assert matchPath != null;
      # assert trace "Path: ${token'}; match as ${toString path}" true;
      path;

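  # For illustration (untested sketch): headerToPath "[dependencies.libc]" 1 is
  # expected to produce the path [ "dependencies" "libc" ]; a [[...]] header is
  # passed with wrapLen = 2 to strip the double brackets.
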
  parserInitState = {
    idx = 0;        # 0: expect a key or a section header; 1: expect "="; 2: expect a value.
    path = [];      # attribute path of the section currently being parsed.
    isList = false; # whether the current section was opened by a [[...]] header.
    output = [];    # completed sections, merged by fromTOML at the end.
    elem = {};      # key/value pairs collected for the current section.
  };

  # Imported from nixpkgs library.
  setAttrByPath = attrPath: value:
    if attrPath == [] then value
    else listToAttrs
      [ { name = head attrPath; value = setAttrByPath (tail attrPath) value; } ];

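  # For example, setAttrByPath [ "a" "b" ] true == { a = { b = true; }; }.
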
  closeSection = state:
    state // {
      output = state.output ++ [ (setAttrByPath state.path (
        if state.isList then [ state.elem ]
        else state.elem
      )) ];
    };

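  # For illustration (untested): with path = [ "pkg" "rust" ] and
  # elem = { version = "1.24.0"; }, closeSection appends
  # { pkg = { rust = { version = "1.24.0"; }; }; } to the output, wrapping elem
  # in a singleton list when the section came from a [[...]] header.
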
  readToken = state: token:
    # assert trace "Read '${token}'" true;
    if state.idx == 0 then
      if substring 0 2 token == "[[" then
        (closeSection state) // {
          path = headerToPath token 2;
          isList = true;
          elem = {};
        }
      else if substring 0 1 token == "[" then
        (closeSection state) // {
          path = headerToPath token 1;
          isList = false;
          elem = {};
        }
      else
        assert match "[a-zA-Z0-9_-]+" token != null;
        state // { idx = 1; name = token; }
    else if state.idx == 1 then
      assert token == "=";
      state // { idx = 2; }
    else
      assert state.idx == 2;
      state // {
        idx = 0;
        elem = state.elem // {
          "${state.name}" = tokenToValue token;
        };
      };

  # Aggregate each section into an individual attribute set.
  parser = str:
    closeSection (foldl' readToken parserInitState (tokenizer str));

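  # For illustration (untested sketch): parsing
  #   [package]
  #   name = "demo"
  # folds readToken over [ "[package]" "name" "=" ''"demo"'' ]; the leading
  # "[package]" closes the (empty) initial section, and the final closeSection
  # above records { package = { name = "demo"; }; } in the output list.
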
  fromTOML = toml:
    let
      sections = (parser toml).output;
      # Inlined from nixpkgs library functions.
      zipAttrs = sets:
        listToAttrs (map (n: {
          name = n;
          value =
            let v = catAttrs n sets; in
            # assert trace "Visiting ${n}" true;
            if tail v == [] then head v
            else if isList (head v) then concatLists v
            else if isAttrs (head v) then zipAttrs v
            else throw "cannot merge sections";
        }) (concatLists (map attrNames sets)));
    in
      zipAttrs sections;

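  # For illustration (untested sketch):
  #   fromTOML ''
  #     [package]
  #     name = "demo"
  #
  #     [[bin]]
  #     name = "demo-cli"
  #   ''
  # is expected to evaluate to
  #   { package = { name = "demo"; }; bin = [ { name = "demo-cli"; } ]; }.
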
in

{
  testing = fromTOML (builtins.readFile ./channel-rust-nightly.toml);
  testing_url = fromTOML (builtins.readFile (builtins.fetchurl
    https://static.rust-lang.org/dist/channel-rust-nightly.toml));
  inherit fromTOML;
}