with builtins;

# Tokenizer.
let
  layout_pat = "[ \n]+";
  layout_pat_opt = "[ \n]*";
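  # A token is one of: an equal sign, a `[[...]]` array-of-tables header, a
  # `[...]` table header (the bracketed character class also accepts dots,
  # quotes and `*`, so the whole dotted header body is captured as a single
  # token), a bare identifier, or a double-quoted string.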
  token_pat = ''=|[[][[][a-zA-Z0-9_."*-]+[]][]]|[[][a-zA-Z0-9_."*-]+[]]|[a-zA-Z0-9_-]+|"[^"]*"''; #"

  tokenizer_1_11 = str:
    let
      tokenizer_rec = len: prevTokens: patterns: str:
        let
          pattern = head patterns;
          layoutAndTokens = match pattern str;
          matchLength = stringLength (head layoutAndTokens);
          tokens = prevTokens ++ tail layoutAndTokens;
        in
          if layoutAndTokens == null then
            # If this was the narrowest pattern, return the list of tokens
            # collected so far,
            if tail patterns == [] then prevTokens
            # otherwise retry with the next pattern, which captures half as
            # many tokens.
            else tokenizer_rec len prevTokens (tail patterns) str
          else tokenizer_rec len tokens patterns (substring matchLength len str);

      avgTokenSize = 100;
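      # Smallest n >= 1 such that 2^(n - 1) >= v; an over-approximation of the
      # base-2 logarithm, used below to bound the pattern-doubling depth.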
      ceilLog2 = v:
        let inner = n: i: if i < v then inner (n + 1) (i * 2) else n; in
        inner 1 1;

      # The builtins.match function matches the entire string and returns a
      # list of all captured elements. This is the most efficient way to build
      # a tokenizer, provided we can write a pattern which captures every token
      # of the file. Unfortunately the C++ std::regex engine does not support
      # captures inside repeated patterns. As a work-around, we generate
      # patterns which match tokens in powers of 2, so that we avoid iterating
      # too many times over the content.
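      # Schematically, for a depth of 2 the generated list of patterns is
      #   [ "(<layout?><4 tokens>).*" "(<layout?><2 tokens>).*" "(<layout?><1 token>).*" ]
      # and tokenizer_rec consumes as much input as possible with the widest
      # pattern before falling back to the narrower ones near the end.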
      generatePatterns = str:
        let
          depth = ceilLog2 (stringLength str / avgTokenSize);
          inner = depth:
            if depth == 0 then [ "(${token_pat})" ]
            else
              let next = inner (depth - 1); in
              [ "${head next}${layout_pat}${head next}" ] ++ next;
        in
          map (pat: "(${layout_pat_opt}${pat}).*" ) (inner depth);

    in
      tokenizer_rec (stringLength str) [] (generatePatterns str) str;

  tokenizer_1_12 = str:
    let
      # Nix 1.12 has the builtins.split function, which allows tokenizing the
      # file quickly by iterating with a simple regexp.
      layoutTokenList = split "(${token_pat})" str;
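      # builtins.split interleaves the unmatched text with singleton lists
      # holding the captured token, e.g. split "(b)" "abc" == [ "a" [ "b" ] "c" ],
      # so the string elements are layout to discard and the lists wrap tokens.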
      isLayout = s: match layout_pat_opt s != null;
      filterLayout = list:
        filter (s:
          if isString s then
            if isLayout s then false
            else throw "Error: Unexpected token: '${s}'"
          else true) list;
      removeTokenWrapper = list:
        map (x: assert tail x == []; head x) list;
    in
      removeTokenWrapper (filterLayout layoutTokenList);

  tokenizer =
    if builtins ? split
    then tokenizer_1_12
    else tokenizer_1_11;
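  # Both implementations should produce the same flat list of tokens, e.g.:
  #   tokenizer "[a] b = \"c\"" == [ "[a]" "b" "=" "\"c\"" ]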
in

# Parse entry headers.
let
  unescapeString = str:
    # Let's ignore any escape character for the moment.
    assert match ''"[^"]*"'' str != null; #"
    substring 1 (stringLength str - 2) str;
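  # e.g. unescapeString ''"abc"'' == "abc"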

  # Matches the content of TOML section names: bare or double-quoted identifiers.
  ident_pat = ''[a-zA-Z0-9_-]+|"[^"]*"''; #"

  removeBraces = token: wrapLen:
    substring wrapLen (stringLength token - 2 * wrapLen) token;
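  # e.g. removeBraces "[[a]]" 2 == "a" and removeBraces "[a.b]" 1 == "a.b"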

  # Note, this implementation is limited to 11 identifiers.
  matchPathFun_1_11 = token:
    let
      # match header_pat "a.b.c" == [ "a" ".b" "b" ".c" "c" ]
      header_pat =
        foldl' (pat: n: "(${ident_pat})([.]${pat})?")
          "(${ident_pat})" (genList (n: 0) 10);
      matchPath = match header_pat token;
      filterDot = filter (s: substring 0 1 s != ".") matchPath;
    in
      filterDot;

  matchPathFun_1_12 = token:
    map (e: head e)
      (filter (s: isList s)
        (split "(${ident_pat})" token));
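  # e.g. matchPathFun_1_12 ''a."b.c"'' == [ "a" ''"b.c"'' ]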

  matchPathFun =
    if builtins ? split
    then matchPathFun_1_12
    else matchPathFun_1_11;

  headerToPath = token: wrapLen:
    let
      token' = removeBraces token wrapLen;
      matchPath = matchPathFun token';
      path =
        map (s:
          if substring 0 1 s != ''"'' then s #"
          else unescapeString s
        ) matchPath;
    in
      assert matchPath != null;
      # assert trace "Path: ${token'}; match as ${toString path}" true;
      path;
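  # e.g. headerToPath ''[pkg."rust-std"]'' 1 yields the path [ "pkg" "rust-std" ]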
in

# Reconstruct the equivalent attribute set.
let
  tokenToValue = token:
    if token == "true" then true
    else if token == "false" then false
    else unescapeString token;
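  # Note: only booleans and double-quoted strings are handled; other TOML
  # value types (numbers, dates, inline arrays, ...) are not supported here.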

  parserInitState = {
    idx = 0;
    path = [];
    isList = false;
    output = [];
    elem = {};
  };
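  # State carried through readToken:
  #   idx:    position inside the current `name = value` statement
  #           (0: name or section header, 1: equal sign, 2: value);
  #   path:   attribute path of the section being read;
  #   isList: whether the current section is an array of tables ("[[...]]");
  #   output: attribute sets of the sections closed so far;
  #   elem:   key/value pairs accumulated for the current section.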

  # Imported from nixpkgs library.
  setAttrByPath = attrPath: value:
    if attrPath == [] then value
    else listToAttrs
      [ { name = head attrPath; value = setAttrByPath (tail attrPath) value; } ];
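  # e.g. setAttrByPath [ "a" "b" ] 3 == { a = { b = 3; }; }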

  closeSection = state:
    state // {
      output = state.output ++ [ (setAttrByPath state.path (
        if state.isList then [ state.elem ]
        else state.elem
      )) ];
    };

  readToken = state: token:
    # assert trace "Read '${token}'" true;
    if state.idx == 0 then
      if substring 0 2 token == "[[" then
        (closeSection state) // {
          path = headerToPath token 2;
          isList = true;
          elem = {};
        }
      else if substring 0 1 token == "[" then
        (closeSection state) // {
          path = headerToPath token 1;
          isList = false;
          elem = {};
        }
      else
        assert match "[a-zA-Z0-9_-]+" token != null;
        state // { idx = 1; name = token; }
    else if state.idx == 1 then
      assert token == "=";
      state // { idx = 2; }
    else
      assert state.idx == 2;
      state // {
        idx = 0;
        elem = state.elem // {
          "${state.name}" = tokenToValue token;
        };
      };

  # Aggregate each section into an individual attribute set.
  parser = str:
    closeSection (foldl' readToken parserInitState (tokenizer str));

  fromTOML = toml:
    let
      sections = (parser toml).output;
      # Inlined from nixpkgs library functions.
      zipAttrs = sets:
        listToAttrs (map (n: {
          name = n;
          value =
            let v = catAttrs n sets; in
            # assert trace "Visiting ${n}" true;
            if tail v == [] then head v
            else if isList (head v) then concatLists v
            else if isAttrs (head v) then zipAttrs v
            else throw "cannot merge sections";
        }) (concatLists (map attrNames sets)));
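      # zipAttrs recursively merges the per-section attribute sets and
      # concatenates lists, e.g.:
      #   zipAttrs [ { a = { x = 1; }; } { a = { y = 2; }; } ] == { a = { x = 1; y = 2; }; }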
    in
      zipAttrs sections;
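  # End-to-end example:
  #   fromTOML "[a]\nk = \"v\"" == { a = { k = "v"; }; }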

in

{
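  # Self-tests: parse a local copy and the remote nightly Rust channel manifest.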
  testing = fromTOML (builtins.readFile ./channel-rust-nightly.toml);
  testing_url = fromTOML (builtins.readFile (builtins.fetchurl
    https://static.rust-lang.org/dist/channel-rust-nightly.toml));
  inherit fromTOML;
}