elipl/lib/til/parser.ex
2025-07-11 21:45:31 +02:00

1010 lines
35 KiB
Elixir

defmodule Til.Parser do
  @moduledoc """
  Parser for the Tilly Lisp dialect.
  It transforms source code into a collection of Node Maps.
  """

  # Parser state threaded through every parsing function:
  #   offset    - 0-based character offset into the original source
  #   line/col  - 1-based position, recorded in each node's location
  #   file_name - stamped onto every node created via add_node/6
  #   nodes     - accumulator map of node_id => node map
  defstruct offset: 0, line: 1, col: 1, file_name: "unknown", nodes: %{}
  @doc """
  Parses a source string into a map of AST nodes.

  Returns `{:ok, nodes}` where `nodes` maps node ids to node maps. The result
  always contains exactly one `:file` root node (`parent_id: nil`) whose
  `children` list the top-level expressions sorted by start offset. Parsing
  never fails as a whole: malformed input is recorded via `:parsing_error`
  on the affected nodes, so the return is always `{:ok, _}`.
  """
  def parse(source_string, file_name \\ "unknown") do
    file_node_id = System.unique_integer([:monotonic, :positive])
    # The file node starts at the very beginning of the source.
    file_start_offset = 0
    file_start_line = 1
    file_start_col = 1

    # End location and raw_string are finalized after parsing all content.
    prelim_file_node = %{
      id: file_node_id,
      type_id: nil,
      # File node is the root
      parent_id: nil,
      file: file_name,
      # End position TBD (filled in below)
      location: [file_start_offset, file_start_line, file_start_col, 0, 0, 0],
      # TBD
      raw_string: "",
      ast_node_type: :file,
      # TBD
      children: [],
      parsing_error: nil
    }

    initial_state = %__MODULE__{
      file_name: file_name,
      # Seed the node table with the preliminary file node
      nodes: %{file_node_id => prelim_file_node},
      offset: 0,
      line: 1,
      col: 1
    }

    # The source is passed twice: once as the immutable original (for
    # raw_string slicing by nested parsers) and once as the remainder to
    # consume. Top-level expressions are parented to the file node.
    final_state_after_expressions =
      parse_all_expressions(source_string, source_string, initial_state, file_node_id)

    # Finalize the file node: compute the end position of the entire source.
    {file_end_line, file_end_col} = calculate_new_line_col(source_string, 1, 1)
    # Offset is 0-indexed, so the end offset equals the character count.
    file_end_offset = String.length(source_string)

    # Collect direct children of the file node, ordered by appearance.
    file_children_ids =
      final_state_after_expressions.nodes
      |> Map.values()
      |> Enum.filter(&(&1.parent_id == file_node_id))
      # Sort by start offset to maintain order of appearance in the source
      |> Enum.sort_by(fn node -> hd(node.location) end)
      |> Enum.map(& &1.id)

    updated_file_node =
      final_state_after_expressions.nodes
      |> Map.get(file_node_id)
      |> Map.merge(%{
        location: [
          file_start_offset,
          file_start_line,
          file_start_col,
          file_end_offset,
          file_end_line,
          file_end_col
        ],
        # The entire source is the raw string of the file node
        raw_string: source_string,
        children: file_children_ids
      })

    final_nodes =
      Map.put(final_state_after_expressions.nodes, file_node_id, updated_file_node)

    {:ok, final_nodes}
  end
  # --- Main Parsing Logic ---

  # Parses every top-level expression until the input is exhausted.
  #
  # `original_source_string` is the complete initial source (kept around so
  # nested parsers can slice raw strings by absolute offset); `source_string`
  # is the not-yet-consumed remainder. `parent_id_for_top_level_expressions`
  # is the id that top-level expressions are parented to (the :file node).
  #
  # Returns the final parser state; all parsed nodes live in `state.nodes`.
  defp parse_all_expressions(
         original_source_string,
         source_string,
         state,
         parent_id_for_top_level_expressions
       ) do
    case skip_whitespace(source_string, state) do
      {:eos, final_state} ->
        final_state

      {:ok, remaining_source, current_state} ->
        if remaining_source == "" do
          # All content parsed; nothing left after skipping whitespace.
          # Successful termination for this branch.
          current_state
        else
          # There's actual content to parse.
          case parse_datum(
                 original_source_string,
                 remaining_source,
                 current_state,
                 parent_id_for_top_level_expressions
               ) do
            {:ok, _node_id, next_source, next_state} ->
              parse_all_expressions(
                original_source_string,
                next_source,
                next_state,
                parent_id_for_top_level_expressions
              )

            {:error_node, _node_id, _reason, next_source, next_state} ->
              # An error node was created by parse_datum. Input was consumed,
              # so we can safely continue from next_source.
              parse_all_expressions(
                original_source_string,
                next_source,
                next_state,
                parent_id_for_top_level_expressions
              )

              # NOTE: This relies on parse_datum (via create_error_node_and_advance,
              # which consumes at least 1 char) always consuming input when the
              # source is non-empty; otherwise this recursion would never terminate.
          end
        end
    end
  end
  # Parses a single datum: a literal (integer, string, atom), a symbol, or a
  # collection ((...), [...], {...}, m{...}).
  #
  # Always returns {:ok, node_id, rest, state} or
  # {:error_node, node_id, reason, rest, state}; in both cases at least one
  # character of input is consumed, which guarantees progress for callers.
  defp parse_datum(original_source_string, source, state, parent_id) do
    # Peek for multi-character tokens first
    cond do
      String.starts_with?(source, "m{") ->
        # Returns {:ok | :error_node, ...}
        parse_map_expression(original_source_string, source, state, parent_id)

      # Fallback to single-character dispatch
      true ->
        char = String.first(source)

        cond do
          char == "(" ->
            parse_s_expression(original_source_string, source, state, parent_id)

          char == ")" ->
            # Unexpected closing parenthesis: consume 1 char for the error token
            create_error_node_and_advance(source, state, parent_id, 1, "Unexpected ')'")

          char == "[" ->
            parse_list_expression(original_source_string, source, state, parent_id)

          char == "]" ->
            # Unexpected closing square bracket
            create_error_node_and_advance(source, state, parent_id, 1, "Unexpected ']'")

          # For tuples
          char == "{" ->
            parse_tuple_expression(original_source_string, source, state, parent_id)

          char == "}" ->
            # Unexpected closing curly brace
            create_error_node_and_advance(source, state, parent_id, 1, "Unexpected '}'")

          char == "'" ->
            parse_string_datum(original_source_string, source, state, parent_id)

          char == ":" ->
            # A leading ':' is first tried as an atom literal like :foo.
            case parse_atom_datum(source, state, parent_id) do
              {:ok, node_id, rest, new_state} ->
                {:ok, node_id, rest, new_state}

              {:error, :not_atom} ->
                # Not a valid atom (e.g. a bare ":"). Fall back to general
                # symbol parsing; integer parsing can't match a leading ':'.
                case parse_symbol_datum(source, state, parent_id) do
                  {:ok, node_id, rest, new_state} ->
                    {:ok, node_id, rest, new_state}

                  {:error, :not_symbol} ->
                    # Neither a valid atom nor a valid symbol
                    create_error_node_and_advance(
                      source,
                      state,
                      parent_id,
                      1,
                      "Unknown token starting with ':'"
                    )
                end
            end

          true ->
            # Default case: try integer first, then symbol, else 1-char error.
            case parse_integer_datum(source, state, parent_id) do
              {:ok, node_id, rest, new_state} ->
                {:ok, node_id, rest, new_state}

              {:error, :not_integer} ->
                case parse_symbol_datum(source, state, parent_id) do
                  {:ok, node_id, rest, new_state} ->
                    {:ok, node_id, rest, new_state}

                  {:error, :not_symbol} ->
                    # Not a symbol either; consume 1 char for the unknown token.
                    create_error_node_and_advance(source, state, parent_id, 1, "Unknown token")
                end
            end
        end

        # end inner cond
    end

    # end outer cond
  end
# --- Datum Parsing Helpers --- (parse_string_datum, process_string_content)
defp parse_string_datum(_original_source_string, source, state, parent_id) do
# state is before consuming "'"
initial_state_for_token = state
strip_indent = initial_state_for_token.col - 1
# Consume opening "'"
{opening_tick, source_after_opening_tick} = String.split_at(source, 1)
case :binary.match(source_after_opening_tick, "'") do
:nomatch ->
# Unclosed string
content_segment = source_after_opening_tick
raw_token = opening_tick <> content_segment
state_at_node_end = advance_pos(initial_state_for_token, raw_token)
location = [
initial_state_for_token.offset,
initial_state_for_token.line,
initial_state_for_token.col,
state_at_node_end.offset,
state_at_node_end.line,
state_at_node_end.col
]
processed_value = process_string_content(content_segment, strip_indent)
{node_id, state_with_error_node} =
add_node(
initial_state_for_token,
parent_id,
location,
raw_token,
:literal_string,
%{value: processed_value, parsing_error: "Unclosed string literal"}
)
final_state = %{
state_with_error_node
| offset: state_at_node_end.offset,
line: state_at_node_end.line,
col: state_at_node_end.col
}
{:error_node, node_id, "Unclosed string literal", "", final_state}
# _tick_length will be 1 for "`"
{idx_closing_tick_in_segment, _tick_length} ->
content_segment =
String.slice(source_after_opening_tick, 0, idx_closing_tick_in_segment)
closing_tick = "'"
raw_token = opening_tick <> content_segment <> closing_tick
rest_of_source =
String.slice(source_after_opening_tick, (idx_closing_tick_in_segment + 1)..-1//1)
state_at_node_end = advance_pos(initial_state_for_token, raw_token)
location = [
initial_state_for_token.offset,
initial_state_for_token.line,
initial_state_for_token.col,
state_at_node_end.offset,
state_at_node_end.line,
state_at_node_end.col
]
processed_value = process_string_content(content_segment, strip_indent)
{new_node_id, state_with_node} =
add_node(
initial_state_for_token,
parent_id,
location,
raw_token,
:literal_string,
%{value: processed_value}
)
final_state = %{
state_with_node
| offset: state_at_node_end.offset,
line: state_at_node_end.line,
col: state_at_node_end.col
}
{:ok, new_node_id, rest_of_source, final_state}
end
end
defp process_string_content(content_str, strip_indent) when strip_indent >= 0 do
lines = String.split(content_str, "\n", trim: false)
# Will always exist, even for empty content_str -> ""
first_line = List.first(lines)
rest_lines =
if length(lines) > 1 do
List.delete_at(lines, 0)
else
[]
end
processed_rest_lines =
Enum.map(rest_lines, fn line ->
current_leading_spaces_count =
Regex.run(~r/^(\s*)/, line)
|> List.first()
|> String.length()
spaces_to_remove = min(current_leading_spaces_count, strip_indent)
String.slice(line, spaces_to_remove..-1//1)
end)
all_processed_lines = [first_line | processed_rest_lines]
Enum.join(all_processed_lines, "\n")
end
  # Parses an atom literal: a colon followed by one or more non-delimiter
  # characters (delimiters: whitespace, (, ), [, ], {, }). The colon is part
  # of the atom's raw string; the Elixir atom value is built from the name
  # after it.
  #
  # Returns {:ok, node_id, rest, state} or {:error, :not_atom} without
  # consuming input (so callers can fall back to symbol parsing).
  defp parse_atom_datum(source, state, parent_id) do
    case Regex.run(~r/^:([^\s\(\)\[\]\{\}]+)/, source) do
      # raw_atom_str is like ":foo", atom_name_part is "foo"
      [raw_atom_str, atom_name_part] ->
        # The regex [^...]+ guarantees atom_name_part is non-empty.
        rest_after_atom = String.slice(source, String.length(raw_atom_str)..-1//1)
        start_offset = state.offset
        start_line = state.line
        start_col = state.col
        state_after_token = advance_pos(state, raw_atom_str)
        end_offset = state_after_token.offset
        end_line = state_after_token.line
        end_col = state_after_token.col
        location = [start_offset, start_line, start_col, end_offset, end_line, end_col]

        # Convert the name part (e.g., "foo") to an Elixir atom (e.g., :foo).
        # NOTE(review): String.to_atom/1 creates atoms from arbitrary source
        # text and atoms are never garbage collected, so parsing untrusted
        # input can exhaust the atom table. Consider String.to_existing_atom/1
        # or plain strings if this parser ever sees untrusted sources.
        atom_value = String.to_atom(atom_name_part)

        {new_node_id, state_with_node} =
          add_node(
            state,
            parent_id,
            location,
            raw_atom_str,
            :literal_atom,
            %{value: atom_value}
          )

        final_state = %{
          state_with_node
          | offset: end_offset,
            line: end_line,
            col: end_col
        }

        {:ok, new_node_id, rest_after_atom, final_state}

      # No match: e.g. a bare ":" or ":" followed by a delimiter.
      _ ->
        {:error, :not_atom}
    end
  end
defp parse_integer_datum(source, state, parent_id) do
case Integer.parse(source) do
{int_val, rest_after_int} ->
raw_int =
String.slice(source, 0, String.length(source) - String.length(rest_after_int))
start_offset = state.offset
start_line = state.line
start_col = state.col
state_after_token = advance_pos(state, raw_int)
end_offset = state_after_token.offset
end_line = state_after_token.line
end_col = state_after_token.col
location = [start_offset, start_line, start_col, end_offset, end_line, end_col]
{new_node_id, state_with_node} =
add_node(state, parent_id, location, raw_int, :literal_integer, %{value: int_val})
# Update state to reflect consumed token
final_state = %{state_with_node | offset: end_offset, line: end_line, col: end_col}
{:ok, new_node_id, rest_after_int, final_state}
:error ->
# Indicates failure, source and state are unchanged by this attempt
{:error, :not_integer}
end
end
defp parse_symbol_datum(source, state, parent_id) do
# Regex excludes common delimiters. `m{` is handled before symbol parsing.
case Regex.run(~r/^([^\s\(\)\[\]\{\}]+)/, source) do
[raw_symbol | _] ->
rest_after_symbol = String.slice(source, String.length(raw_symbol)..-1//1)
start_offset = state.offset
start_line = state.line
start_col = state.col
state_after_token = advance_pos(state, raw_symbol)
end_offset = state_after_token.offset
end_line = state_after_token.line
end_col = state_after_token.col
location = [start_offset, start_line, start_col, end_offset, end_line, end_col]
{new_node_id, state_with_node} =
add_node(state, parent_id, location, raw_symbol, :symbol, %{name: raw_symbol})
# Update state to reflect consumed token
final_state = %{
state_with_node
| offset: end_offset,
line: end_line,
col: end_col
}
{:ok, new_node_id, rest_after_symbol, final_state}
nil ->
# Indicates failure, source and state are unchanged by this attempt
{:error, :not_symbol}
end
end
defp create_error_node_and_advance(
source_for_token,
state_before_token,
parent_id,
num_chars_for_token,
error_message
) do
{raw_token, rest_of_source} = String.split_at(source_for_token, num_chars_for_token)
start_offset = state_before_token.offset
start_line = state_before_token.line
start_col = state_before_token.col
state_after_token_consumed = advance_pos(state_before_token, raw_token)
end_offset = state_after_token_consumed.offset
end_line = state_after_token_consumed.line
end_col = state_after_token_consumed.col
location = [start_offset, start_line, start_col, end_offset, end_line, end_col]
{error_node_id, state_with_error_node} =
add_node(state_before_token, parent_id, location, raw_token, :unknown, %{
parsing_error: error_message
})
# The state for further parsing must reflect the consumed token's position and include the new error node
final_error_state = %{
state_with_error_node
| offset: end_offset,
line: end_line,
col: end_col
}
{:error_node, error_node_id, error_message, rest_of_source, final_error_state}
end
defp parse_s_expression(original_source_string, source, state, parent_id) do
# Standard S-expression parsing via parse_collection
result =
parse_collection(
original_source_string,
source,
state,
parent_id,
"(",
")",
:s_expression,
"Unclosed S-expression",
"Error parsing element in S-expression. Content might be incomplete."
)
# After parsing, check if it's an 'fn' expression
case result do
{:ok, collection_node_id, rest_after_collection, state_after_collection} ->
collection_node = Map.get(state_after_collection.nodes, collection_node_id)
if is_fn_expression?(collection_node, state_after_collection.nodes) do
transformed_node =
transform_to_lambda_expression(collection_node, state_after_collection.nodes)
final_state = %{
state_after_collection
| nodes: Map.put(state_after_collection.nodes, transformed_node.id, transformed_node)
}
{:ok, transformed_node.id, rest_after_collection, final_state}
else
# Not an fn expression, return as is
result
end
_error_or_other ->
# Propagate errors or other results from parse_collection
result
end
end
# Helper to check if an S-expression node is an 'fn' expression
defp is_fn_expression?(s_expr_node, nodes_map) do
if s_expr_node.ast_node_type == :s_expression && !Enum.empty?(s_expr_node.children) do
first_child_id = hd(s_expr_node.children)
first_child_node = Map.get(nodes_map, first_child_id)
first_child_node && first_child_node.ast_node_type == :symbol &&
first_child_node.name == "fn"
else
false
end
end
  # Transforms a generic S-expression node (already verified to be an `fn`
  # form by the caller) into a :lambda_expression node, or annotates it with
  # a :parsing_error when the form is malformed.
  #
  # Expected child layout: [fn_symbol_id, params_s_expr_id, body_form1_id, ...]
  # where the params S-expression lists zero or more argument specs followed
  # by a return type spec, e.g. (fn ((a integer) (b atom) atom) ...).
  defp transform_to_lambda_expression(s_expr_node, nodes_map) do
    # First child is the `fn` symbol itself - already checked by the caller.
    _fn_symbol_id = Enum.at(s_expr_node.children, 0)

    if length(s_expr_node.children) < 2 do
      %{s_expr_node | parsing_error: "Malformed 'fn' expression: missing parameters list."}
    else
      params_s_expr_id = Enum.at(s_expr_node.children, 1)
      params_s_expr_node = Map.get(nodes_map, params_s_expr_id)

      if !(params_s_expr_node && params_s_expr_node.ast_node_type == :s_expression) do
        Map.put(
          s_expr_node,
          :parsing_error,
          "Malformed 'fn' expression: parameters list is not an S-expression."
        )
      else
        # Children of the parameters S-expression, e.g. for
        # (fn ((a integer) (b atom) atom) ...), these are the IDs of
        # [(a integer), (b atom), atom].
        all_param_children_ids = Map.get(params_s_expr_node, :children, [])

        {arg_spec_node_ids, return_type_spec_node_id} =
          if Enum.empty?(all_param_children_ids) do
            # Case: (fn () body) -> no args, nil means an inferred return type
            {[], nil}
          else
            # Case: (fn (arg1 type1 ... ret_type) body)
            # Last element is the return type spec, the rest are arg specs.
            args = Enum.take(all_param_children_ids, length(all_param_children_ids) - 1)
            ret_type_id = List.last(all_param_children_ids)
            {args, ret_type_id}
          end

        # Each arg spec must be a bare symbol (e.g. x) or a two-element
        # S-expression (param_symbol type_spec), e.g. (x integer).
        all_arg_specs_valid =
          Enum.all?(arg_spec_node_ids, fn arg_id ->
            arg_node = Map.get(nodes_map, arg_id)

            case arg_node do
              # e.g. x
              %{ast_node_type: :symbol} ->
                true

              # e.g. (x integer)
              %{ast_node_type: :s_expression, children: s_children} ->
                if length(s_children) == 2 do
                  param_sym_node = Map.get(nodes_map, hd(s_children))
                  type_spec_node = Map.get(nodes_map, hd(tl(s_children)))

                  param_sym_node && param_sym_node.ast_node_type == :symbol &&
                    type_spec_node &&
                    (type_spec_node.ast_node_type == :symbol ||
                       type_spec_node.ast_node_type == :s_expression)
                else
                  # Not a valid (param_symbol type_spec) structure
                  false
                end

              # Not a symbol or valid S-expression for an arg spec
              _ ->
                false
            end
          end)

        # Return type spec must be nil (inferred) or a symbol/S-expression node.
        return_type_spec_valid =
          if is_nil(return_type_spec_node_id) do
            # Inferred return type is valid
            true
          else
            ret_type_node = Map.get(nodes_map, return_type_spec_node_id)

            ret_type_node &&
              (ret_type_node.ast_node_type == :symbol ||
                 ret_type_node.ast_node_type == :s_expression)
          end

        if all_arg_specs_valid && return_type_spec_valid do
          # Body starts after 'fn' and the params S-expression.
          body_node_ids = Enum.drop(s_expr_node.children, 2)

          Map.merge(s_expr_node, %{
            :ast_node_type => :lambda_expression,
            :params_s_expr_id => params_s_expr_id,
            :arg_spec_node_ids => arg_spec_node_ids,
            :return_type_spec_node_id => return_type_spec_node_id,
            :body_node_ids => body_node_ids
          })
        else
          # Attach the most specific malformation message we can determine.
          error_message =
            cond do
              !all_arg_specs_valid ->
                "Malformed 'fn' expression: invalid argument specification(s)."

              !return_type_spec_valid ->
                "Malformed 'fn' expression: invalid return type specification."

              # Generic fallback
              true ->
                "Malformed 'fn' expression."
            end

          Map.put(s_expr_node, :parsing_error, error_message)
        end
      end
    end
  end
  # Parses a square-bracket list literal: "[" elements... "]".
  # Thin wrapper over parse_collection/9; returns its
  # {:ok, ...} or {:error_node, ...} result unchanged.
  defp parse_list_expression(original_source_string, source, state, parent_id) do
    parse_collection(
      original_source_string,
      source,
      state,
      parent_id,
      "[",
      "]",
      :list_expression,
      "Unclosed list",
      "Error parsing element in list. Content might be incomplete."
    )
  end
  # Parses a map literal: "m{" elements... "}".
  # Thin wrapper over parse_collection/9; returns its
  # {:ok, ...} or {:error_node, ...} result unchanged.
  defp parse_map_expression(original_source_string, source, state, parent_id) do
    parse_collection(
      original_source_string,
      source,
      state,
      parent_id,
      # Opening token (two characters)
      "m{",
      # Closing token
      "}",
      :map_expression,
      "Unclosed map",
      "Error parsing element in map. Content might be incomplete."
    )
  end
  # Parses a tuple literal: "{" elements... "}".
  # Thin wrapper over parse_collection/9; returns its
  # {:ok, ...} or {:error_node, ...} result unchanged.
  defp parse_tuple_expression(original_source_string, source, state, parent_id) do
    parse_collection(
      original_source_string,
      source,
      state,
      parent_id,
      "{",
      "}",
      :tuple_expression,
      "Unclosed tuple",
      "Error parsing element in tuple. Content might be incomplete."
    )
  end
  # Shared implementation behind every bracketed collection form:
  # S-expressions "(...)", lists "[...]", maps "m{...}" and tuples "{...}".
  #
  # Consumes the opening token, registers a preliminary collection node
  # (end location, raw_string and children still TBD), then delegates
  # element parsing to parse_collection_elements/9, which finalizes the node.
  #
  # Returns {:ok, node_id, rest, state} or
  # {:error_node, node_id, reason, rest, state} when the collection is unclosed.
  defp parse_collection(
         original_source_string,
         source,
         state,
         parent_id,
         open_char_str,
         # Used by parse_collection_elements
         close_char_str,
         ast_node_type,
         # Used by parse_collection_elements
         unclosed_error_msg,
         # Used by parse_collection_elements
         element_error_msg
       ) do
    # Record the start position, then consume the opening token ('(', '[', 'm{').
    collection_start_offset = state.offset
    collection_start_line = state.line
    collection_start_col = state.col
    open_char_len = String.length(open_char_str)
    {_opening_token, rest_after_opening_token} = String.split_at(source, open_char_len)
    current_state = advance_pos(state, open_char_str)

    collection_node_id = System.unique_integer([:monotonic, :positive])

    # Preliminary node: end location, raw_string and children are filled in
    # by parse_collection_elements once the closing token (or EOS) is reached.
    prelim_collection_node = %{
      id: collection_node_id,
      type_id: nil,
      parent_id: parent_id,
      file: current_state.file_name,
      # End TBD
      location: [collection_start_offset, collection_start_line, collection_start_col, 0, 0, 0],
      # TBD
      raw_string: "",
      ast_node_type: ast_node_type,
      children: [],
      parsing_error: nil
    }

    current_state_with_prelim_node = %{
      current_state
      | nodes: Map.put(current_state.nodes, collection_node_id, prelim_collection_node)
    }

    collection_start_pos_for_children =
      {collection_start_offset, collection_start_line, collection_start_col}

    # Pass all necessary params to the generalized element parser.
    result =
      parse_collection_elements(
        original_source_string,
        rest_after_opening_token,
        current_state_with_prelim_node,
        collection_node_id,
        [],
        collection_start_pos_for_children,
        close_char_str,
        unclosed_error_msg,
        element_error_msg
      )

    # Normalize to the {:ok, node_id, ...} / {:error_node, node_id, ...}
    # shape used by parse_datum.
    case result do
      {:ok, returned_collection_node_id, rest, state_after_elements} ->
        {:ok, returned_collection_node_id, rest, state_after_elements}

      {:error, reason, rest, state_after_elements} ->
        # The collection node itself carries the error (typically an
        # unclosed collection).
        {:error_node, collection_node_id, reason, rest, state_after_elements}
    end
  end
  # Generalized from parse_s_expression_elements.
  #
  # Parses the elements of a collection until `closing_char_str` (or EOS) is
  # reached, then finalizes the collection node: children in source order,
  # end location, and raw_string sliced from original_source_string.
  #
  # `unclosed_error_message` is attached to the collection when EOS is hit
  # before the closing token. `element_error_message` is threaded through
  # recursively but currently unused (child errors carry their own messages).
  #
  # Returns {:ok, collection_node_id, rest, state} or
  # {:error, reason, "", state} for an unclosed collection.
  defp parse_collection_elements(
         original_source_string,
         source,
         state,
         collection_node_id,
         children_ids_acc,
         collection_start_pos_tuple,
         # e.g. ")" or "]"
         closing_char_str,
         # e.g. "Unclosed S-expression"
         unclosed_error_message,
         # e.g. "Error parsing element in S-expression..." (currently unused)
         element_error_message
       ) do
    case skip_whitespace(source, state) do
      {:eos, current_state_at_eos} ->
        # Ran out of input before the closing token: unclosed collection.
        collection_node = Map.get(current_state_at_eos.nodes, collection_node_id)
        start_offset = elem(collection_start_pos_tuple, 0)
        end_offset = current_state_at_eos.offset

        actual_raw_string =
          String.slice(original_source_string, start_offset, end_offset - start_offset)

        updated_collection_node = %{
          collection_node
          | parsing_error: unclosed_error_message,
            children: Enum.reverse(children_ids_acc),
            location: [
              start_offset,
              elem(collection_start_pos_tuple, 1),
              elem(collection_start_pos_tuple, 2),
              end_offset,
              current_state_at_eos.line,
              current_state_at_eos.col
            ],
            raw_string: actual_raw_string
        }

        final_state = %{
          current_state_at_eos
          | nodes:
              Map.put(current_state_at_eos.nodes, collection_node_id, updated_collection_node)
        }

        # The collection node itself is the error node here; its id is implied.
        {:error, unclosed_error_message, "", final_state}

      {:ok, remaining_source, current_state} ->
        # Check whether the remaining source starts with the closing token.
        if String.starts_with?(remaining_source, closing_char_str) do
          # End of collection: consume the closing token and finalize the node.
          closing_char_len = String.length(closing_char_str)

          {_closing_token, rest_after_closing_token} =
            String.split_at(remaining_source, closing_char_len)

          final_collection_state = advance_pos(current_state, closing_char_str)
          collection_node = Map.get(final_collection_state.nodes, collection_node_id)
          coll_final_start_offset = elem(collection_start_pos_tuple, 0)
          coll_final_start_line = elem(collection_start_pos_tuple, 1)
          coll_final_start_col = elem(collection_start_pos_tuple, 2)
          coll_final_end_offset = final_collection_state.offset
          coll_final_end_line = final_collection_state.line
          coll_final_end_col = final_collection_state.col

          actual_raw_string =
            String.slice(
              original_source_string,
              coll_final_start_offset,
              coll_final_end_offset - coll_final_start_offset
            )

          updated_collection_node = %{
            collection_node
            | children: Enum.reverse(children_ids_acc),
              location: [
                coll_final_start_offset,
                coll_final_start_line,
                coll_final_start_col,
                coll_final_end_offset,
                coll_final_end_line,
                coll_final_end_col
              ],
              raw_string: actual_raw_string
          }

          final_state_with_collection = %{
            final_collection_state
            | nodes:
                Map.put(
                  final_collection_state.nodes,
                  collection_node_id,
                  updated_collection_node
                )
          }

          {:ok, collection_node_id, rest_after_closing_token, final_state_with_collection}
        else
          # Parse one element, parented to the collection node.
          case parse_datum(
                 original_source_string,
                 remaining_source,
                 current_state,
                 collection_node_id
               ) do
            {:ok, child_node_id, next_source_after_elem, next_state_after_elem} ->
              parse_collection_elements(
                original_source_string,
                next_source_after_elem,
                next_state_after_elem,
                collection_node_id,
                # Prepend; reversed once when the collection is finalized
                [child_node_id | children_ids_acc],
                collection_start_pos_tuple,
                closing_char_str,
                unclosed_error_message,
                element_error_message
              )

            {:error_node, child_error_node_id, _child_reason, next_source_after_elem,
             next_state_after_elem} ->
              # Keep the error node as a child and continue with the rest.
              parse_collection_elements(
                original_source_string,
                next_source_after_elem,
                next_state_after_elem,
                collection_node_id,
                [child_error_node_id | children_ids_acc],
                collection_start_pos_tuple,
                closing_char_str,
                unclosed_error_message,
                element_error_message
              )

              # NOTE: relies on parse_datum always returning :ok or :error_node
              # and consuming input on error; a no-consume failure would make
              # this recursion loop forever.
          end
        end
    end
  end
# --- Utility Functions ---
# Note: The `extra_fields` argument was changed from optional to required
# as the default value was never used according to compiler warnings.
defp add_node(state, parent_id, location, raw_string, ast_node_type, extra_fields) do
node_id = System.unique_integer([:monotonic, :positive])
node =
%{
id: node_id,
type_id: nil,
parent_id: parent_id,
file: state.file_name,
# [start_offset, start_line, start_col, end_offset, end_line, end_col]
location: location,
raw_string: raw_string,
ast_node_type: ast_node_type
}
|> Map.merge(extra_fields)
{node_id, %{state | nodes: Map.put(state.nodes, node_id, node)}}
end
defp skip_whitespace(source, state = %__MODULE__{offset: o, line: l, col: c}) do
whitespace_match = Regex.run(~r/^\s+/, source)
if whitespace_match do
[ws | _] = whitespace_match
new_offset = o + String.length(ws)
{new_line, new_col} = calculate_new_line_col(ws, l, c)
remaining_source = String.slice(source, String.length(ws)..-1//1)
{:ok, remaining_source, %{state | offset: new_offset, line: new_line, col: new_col}}
else
if String.length(source) == 0 do
{:eos, state}
else
# No leading whitespace
{:ok, source, state}
end
end
end
defp calculate_new_line_col(string_segment, start_line, start_col) do
string_segment
|> String.codepoints()
|> Enum.reduce({start_line, start_col}, fn char, {line, col} ->
if char == "\n" do
{line + 1, 1}
else
{line, col + 1}
end
end)
end
defp advance_pos(state = %__MODULE__{offset: o, line: l, col: c}, consumed_string) do
new_offset = o + String.length(consumed_string)
{new_line, new_col} = calculate_new_line_col(consumed_string, l, c)
%{state | offset: new_offset, line: new_line, col: new_col}
end
end