1010 lines
35 KiB
Elixir
1010 lines
35 KiB
Elixir
defmodule Til.Parser do
  @moduledoc """
  Parser for the Tilly Lisp dialect.

  It transforms source code into a collection of Node Maps.
  """

  # Parser state threaded through every parsing step:
  #   offset    - 0-based grapheme offset into the source (advanced via String.length)
  #   line/col  - 1-based human-readable position
  #   file_name - copied into every emitted node's :file field
  #   nodes     - accumulator map of node_id => node map
  defstruct offset: 0, line: 1, col: 1, file_name: "unknown", nodes: %{}
|
|
|
|
  @doc """
  Parses a source string into a map of AST nodes.

  Returns `{:ok, nodes}` where `nodes` maps node ids to node maps. The root
  is a single `:file` node whose `children` are the top-level expressions in
  source order. Parse errors never abort: they are recorded on individual
  nodes via `:parsing_error`.
  """
  def parse(source_string, file_name \\ "unknown") do
    file_node_id = System.unique_integer([:monotonic, :positive])

    # Initial location for the file node (starts at the beginning)
    file_start_offset = 0
    file_start_line = 1
    file_start_col = 1

    # End location and raw_string will be finalized after parsing all content
    prelim_file_node = %{
      id: file_node_id,
      type_id: nil,
      # File node is the root
      parent_id: nil,
      file: file_name,
      # End TBD
      location: [file_start_offset, file_start_line, file_start_col, 0, 0, 0],
      # TBD
      raw_string: "",
      ast_node_type: :file,
      # TBD
      children: [],
      parsing_error: nil
    }

    initial_state = %__MODULE__{
      file_name: file_name,
      # Register the preliminary file node up front so children can parent to it
      nodes: %{file_node_id => prelim_file_node},
      offset: 0,
      line: 1,
      col: 1
    }

    # Pass original_source_string for raw_string extraction, and file_node_id
    # as parent for top-level expressions.
    final_state_after_expressions =
      parse_all_expressions(source_string, source_string, initial_state, file_node_id)

    # Finalize the file node: compute the end position of the entire source.
    {file_end_line, file_end_col} = calculate_new_line_col(source_string, 1, 1)
    # Offset is 0-indexed, length is the count of characters, so end_offset is length.
    file_end_offset = String.length(source_string)

    # Collect direct children of the file node.
    file_children_ids =
      final_state_after_expressions.nodes
      |> Map.values()
      |> Enum.filter(&(&1.parent_id == file_node_id))
      # Sort by start offset to maintain order of appearance in the source
      |> Enum.sort_by(fn node -> hd(node.location) end)
      |> Enum.map(& &1.id)

    updated_file_node =
      final_state_after_expressions.nodes
      |> Map.get(file_node_id)
      |> Map.merge(%{
        location: [
          file_start_offset,
          file_start_line,
          file_start_col,
          file_end_offset,
          file_end_line,
          file_end_col
        ],
        # The entire source is the raw string of the file node
        raw_string: source_string,
        children: file_children_ids
      })

    final_nodes =
      Map.put(final_state_after_expressions.nodes, file_node_id, updated_file_node)

    {:ok, final_nodes}
  end
|
|
|
|
  # --- Main Parsing Logic ---

  # original_source_string is the complete initial source; source_string is the
  # current remainder. parent_id_for_top_level_expressions is the ID of the node
  # that top-level expressions should be parented to (e.g., the :file node).
  #
  # Repeatedly skips whitespace and parses one datum until the input is
  # exhausted, returning the final parser state (all nodes accumulated inside).
  defp parse_all_expressions(
         original_source_string,
         source_string,
         state,
         parent_id_for_top_level_expressions
       ) do
    case skip_whitespace(source_string, state) do
      {:eos, final_state} ->
        final_state

      {:ok, remaining_source, current_state} ->
        if remaining_source == "" do
          # All content parsed, nothing left after skipping whitespace.
          # This is a successful termination of parsing for the current branch.
          current_state
        else
          # There's actual content to parse.
          case parse_datum(
                 original_source_string,
                 remaining_source,
                 current_state,
                 parent_id_for_top_level_expressions
               ) do
            {:ok, _node_id, next_source, next_state} ->
              parse_all_expressions(
                original_source_string,
                next_source,
                next_state,
                parent_id_for_top_level_expressions
              )

            {:error_node, _node_id, _reason, next_source, next_state} ->
              # An error node was created by parse_datum.
              # Input was consumed. Continue parsing from next_source.
              parse_all_expressions(
                original_source_string,
                next_source,
                next_state,
                parent_id_for_top_level_expressions
              )

              # NOTE: This relies on parse_datum and its components (like
              # create_error_node_and_advance) to always consume input if
              # source_string is not empty. If parse_datum could return
              # :error_node without consuming input on a non-empty string, an
              # infinite loop is possible. Current implementation of
              # create_error_node_and_advance consumes 1 char.
          end
        end
    end
  end
|
|
|
|
  # Parses a single datum: an atom (integer, symbol) or a collection.
  #
  # Dispatch order matters: multi-character openers ("m{") are checked before
  # single-character dispatch so a map literal is not parsed as the symbol "m"
  # followed by a tuple. Returns {:ok, node_id, rest, state} or
  # {:error_node, node_id, reason, rest, state}; errors always consume input.
  defp parse_datum(original_source_string, source, state, parent_id) do
    # Peek for multi-character tokens first
    cond do
      String.starts_with?(source, "m{") ->
        # Returns {:ok | :error_node, ...}
        parse_map_expression(original_source_string, source, state, parent_id)

      # Fallback to single character dispatch
      true ->
        char = String.first(source)

        cond do
          char == "(" ->
            # Returns {:ok | :error_node, ...}
            parse_s_expression(original_source_string, source, state, parent_id)

          char == ")" ->
            # Unexpected closing parenthesis, consume 1 char for the error token ')'
            create_error_node_and_advance(source, state, parent_id, 1, "Unexpected ')'")

          char == "[" ->
            # Returns {:ok | :error_node, ...}
            parse_list_expression(original_source_string, source, state, parent_id)

          char == "]" ->
            # Unexpected closing square bracket, consume 1 char for the error token ']'
            create_error_node_and_advance(source, state, parent_id, 1, "Unexpected ']'")

          # For tuples
          char == "{" ->
            # Returns {:ok | :error_node, ...}
            parse_tuple_expression(original_source_string, source, state, parent_id)

          char == "}" ->
            # Unexpected closing curly brace
            create_error_node_and_advance(source, state, parent_id, 1, "Unexpected '}'")

          char == "'" ->
            # Returns {:ok | :error_node, ...}
            parse_string_datum(original_source_string, source, state, parent_id)

          char == ":" ->
            # If the first char is ':', try to parse as an atom like :foo
            case parse_atom_datum(source, state, parent_id) do
              {:ok, node_id, rest, new_state} ->
                {:ok, node_id, rest, new_state}

              {:error, :not_atom} ->
                # Failed to parse as a specific atom (e.g. ":foo").
                # It could be a symbol that starts with ':' (e.g. if we allow ":" as a symbol).
                # Fallback to general symbol parsing. Integer parsing won't match
                # if it starts with ':'.
                case parse_symbol_datum(source, state, parent_id) do
                  {:ok, node_id, rest, new_state} ->
                    {:ok, node_id, rest, new_state}

                  {:error, :not_symbol} ->
                    # If it started with ':' but wasn't a valid atom and also not a valid symbol
                    create_error_node_and_advance(
                      source,
                      state,
                      parent_id,
                      1,
                      "Unknown token starting with ':'"
                    )
                end
            end

          true ->
            # Default case for other characters.
            # Try parsing as an integer first
            case parse_integer_datum(source, state, parent_id) do
              {:ok, node_id, rest, new_state} ->
                {:ok, node_id, rest, new_state}

              {:error, :not_integer} ->
                # Not an integer, try parsing as a symbol
                case parse_symbol_datum(source, state, parent_id) do
                  {:ok, node_id, rest, new_state} ->
                    {:ok, node_id, rest, new_state}

                  {:error, :not_symbol} ->
                    # Not a symbol either. Consume 1 char for the unknown token.
                    create_error_node_and_advance(source, state, parent_id, 1, "Unknown token")
                end
            end

          # end inner cond
        end

      # end outer cond
    end
  end
|
|
|
|
# --- Datum Parsing Helpers --- (parse_string_datum, process_string_content)
|
|
|
|
defp parse_string_datum(_original_source_string, source, state, parent_id) do
|
|
# state is before consuming "'"
|
|
initial_state_for_token = state
|
|
strip_indent = initial_state_for_token.col - 1
|
|
|
|
# Consume opening "'"
|
|
{opening_tick, source_after_opening_tick} = String.split_at(source, 1)
|
|
|
|
case :binary.match(source_after_opening_tick, "'") do
|
|
:nomatch ->
|
|
# Unclosed string
|
|
content_segment = source_after_opening_tick
|
|
raw_token = opening_tick <> content_segment
|
|
|
|
state_at_node_end = advance_pos(initial_state_for_token, raw_token)
|
|
|
|
location = [
|
|
initial_state_for_token.offset,
|
|
initial_state_for_token.line,
|
|
initial_state_for_token.col,
|
|
state_at_node_end.offset,
|
|
state_at_node_end.line,
|
|
state_at_node_end.col
|
|
]
|
|
|
|
processed_value = process_string_content(content_segment, strip_indent)
|
|
|
|
{node_id, state_with_error_node} =
|
|
add_node(
|
|
initial_state_for_token,
|
|
parent_id,
|
|
location,
|
|
raw_token,
|
|
:literal_string,
|
|
%{value: processed_value, parsing_error: "Unclosed string literal"}
|
|
)
|
|
|
|
final_state = %{
|
|
state_with_error_node
|
|
| offset: state_at_node_end.offset,
|
|
line: state_at_node_end.line,
|
|
col: state_at_node_end.col
|
|
}
|
|
|
|
{:error_node, node_id, "Unclosed string literal", "", final_state}
|
|
|
|
# _tick_length will be 1 for "`"
|
|
{idx_closing_tick_in_segment, _tick_length} ->
|
|
content_segment =
|
|
String.slice(source_after_opening_tick, 0, idx_closing_tick_in_segment)
|
|
|
|
closing_tick = "'"
|
|
raw_token = opening_tick <> content_segment <> closing_tick
|
|
|
|
rest_of_source =
|
|
String.slice(source_after_opening_tick, (idx_closing_tick_in_segment + 1)..-1//1)
|
|
|
|
state_at_node_end = advance_pos(initial_state_for_token, raw_token)
|
|
|
|
location = [
|
|
initial_state_for_token.offset,
|
|
initial_state_for_token.line,
|
|
initial_state_for_token.col,
|
|
state_at_node_end.offset,
|
|
state_at_node_end.line,
|
|
state_at_node_end.col
|
|
]
|
|
|
|
processed_value = process_string_content(content_segment, strip_indent)
|
|
|
|
{new_node_id, state_with_node} =
|
|
add_node(
|
|
initial_state_for_token,
|
|
parent_id,
|
|
location,
|
|
raw_token,
|
|
:literal_string,
|
|
%{value: processed_value}
|
|
)
|
|
|
|
final_state = %{
|
|
state_with_node
|
|
| offset: state_at_node_end.offset,
|
|
line: state_at_node_end.line,
|
|
col: state_at_node_end.col
|
|
}
|
|
|
|
{:ok, new_node_id, rest_of_source, final_state}
|
|
end
|
|
end
|
|
|
|
defp process_string_content(content_str, strip_indent) when strip_indent >= 0 do
|
|
lines = String.split(content_str, "\n", trim: false)
|
|
# Will always exist, even for empty content_str -> ""
|
|
first_line = List.first(lines)
|
|
|
|
rest_lines =
|
|
if length(lines) > 1 do
|
|
List.delete_at(lines, 0)
|
|
else
|
|
[]
|
|
end
|
|
|
|
processed_rest_lines =
|
|
Enum.map(rest_lines, fn line ->
|
|
current_leading_spaces_count =
|
|
Regex.run(~r/^(\s*)/, line)
|
|
|> List.first()
|
|
|> String.length()
|
|
|
|
spaces_to_remove = min(current_leading_spaces_count, strip_indent)
|
|
String.slice(line, spaces_to_remove..-1//1)
|
|
end)
|
|
|
|
all_processed_lines = [first_line | processed_rest_lines]
|
|
Enum.join(all_processed_lines, "\n")
|
|
end
|
|
|
|
  # Parses an atom literal: a colon followed by one or more non-delimiter
  # characters (delimiters: whitespace, (, ), [, ], {, }). The colon itself is
  # part of the atom's raw string; `atom_name_part` is what follows it.
  # Returns {:ok, node_id, rest, state} or {:error, :not_atom} with nothing
  # consumed, letting the caller fall back to symbol parsing.
  defp parse_atom_datum(source, state, parent_id) do
    case Regex.run(~r/^:([^\s\(\)\[\]\{\}]+)/, source) do
      # raw_atom_str is like ":foo", atom_name_part is "foo"
      [raw_atom_str, atom_name_part] ->
        # The regex [^...]+ ensures atom_name_part is not empty.
        rest_after_atom = String.slice(source, String.length(raw_atom_str)..-1//1)
        start_offset = state.offset
        start_line = state.line
        start_col = state.col
        state_after_token = advance_pos(state, raw_atom_str)
        end_offset = state_after_token.offset
        end_line = state_after_token.line
        end_col = state_after_token.col
        location = [start_offset, start_line, start_col, end_offset, end_line, end_col]

        # Convert the name part (e.g., "foo") to an Elixir atom (e.g., :foo).
        # NOTE(review): String.to_atom/1 creates atoms from arbitrary source
        # text; atoms are never garbage-collected, so parsing untrusted input
        # can exhaust the atom table. Consider String.to_existing_atom/1 or
        # keeping the name as a string — confirm the threat model.
        atom_value = String.to_atom(atom_name_part)

        {new_node_id, state_with_node} =
          add_node(
            state,
            parent_id,
            location,
            raw_atom_str,
            :literal_atom,
            %{value: atom_value}
          )

        final_state = %{
          state_with_node
          | offset: end_offset,
            line: end_line,
            col: end_col
        }

        {:ok, new_node_id, rest_after_atom, final_state}

      # No match (nil, or the colon is followed by a delimiter / end of input)
      _ ->
        {:error, :not_atom}
    end
  end
|
|
|
|
defp parse_integer_datum(source, state, parent_id) do
|
|
case Integer.parse(source) do
|
|
{int_val, rest_after_int} ->
|
|
raw_int =
|
|
String.slice(source, 0, String.length(source) - String.length(rest_after_int))
|
|
|
|
start_offset = state.offset
|
|
start_line = state.line
|
|
start_col = state.col
|
|
state_after_token = advance_pos(state, raw_int)
|
|
end_offset = state_after_token.offset
|
|
end_line = state_after_token.line
|
|
end_col = state_after_token.col
|
|
location = [start_offset, start_line, start_col, end_offset, end_line, end_col]
|
|
|
|
{new_node_id, state_with_node} =
|
|
add_node(state, parent_id, location, raw_int, :literal_integer, %{value: int_val})
|
|
|
|
# Update state to reflect consumed token
|
|
final_state = %{state_with_node | offset: end_offset, line: end_line, col: end_col}
|
|
{:ok, new_node_id, rest_after_int, final_state}
|
|
|
|
:error ->
|
|
# Indicates failure, source and state are unchanged by this attempt
|
|
{:error, :not_integer}
|
|
end
|
|
end
|
|
|
|
defp parse_symbol_datum(source, state, parent_id) do
|
|
# Regex excludes common delimiters. `m{` is handled before symbol parsing.
|
|
case Regex.run(~r/^([^\s\(\)\[\]\{\}]+)/, source) do
|
|
[raw_symbol | _] ->
|
|
rest_after_symbol = String.slice(source, String.length(raw_symbol)..-1//1)
|
|
start_offset = state.offset
|
|
start_line = state.line
|
|
start_col = state.col
|
|
state_after_token = advance_pos(state, raw_symbol)
|
|
end_offset = state_after_token.offset
|
|
end_line = state_after_token.line
|
|
end_col = state_after_token.col
|
|
location = [start_offset, start_line, start_col, end_offset, end_line, end_col]
|
|
|
|
{new_node_id, state_with_node} =
|
|
add_node(state, parent_id, location, raw_symbol, :symbol, %{name: raw_symbol})
|
|
|
|
# Update state to reflect consumed token
|
|
final_state = %{
|
|
state_with_node
|
|
| offset: end_offset,
|
|
line: end_line,
|
|
col: end_col
|
|
}
|
|
|
|
{:ok, new_node_id, rest_after_symbol, final_state}
|
|
|
|
nil ->
|
|
# Indicates failure, source and state are unchanged by this attempt
|
|
{:error, :not_symbol}
|
|
end
|
|
end
|
|
|
|
defp create_error_node_and_advance(
|
|
source_for_token,
|
|
state_before_token,
|
|
parent_id,
|
|
num_chars_for_token,
|
|
error_message
|
|
) do
|
|
{raw_token, rest_of_source} = String.split_at(source_for_token, num_chars_for_token)
|
|
|
|
start_offset = state_before_token.offset
|
|
start_line = state_before_token.line
|
|
start_col = state_before_token.col
|
|
|
|
state_after_token_consumed = advance_pos(state_before_token, raw_token)
|
|
end_offset = state_after_token_consumed.offset
|
|
end_line = state_after_token_consumed.line
|
|
end_col = state_after_token_consumed.col
|
|
location = [start_offset, start_line, start_col, end_offset, end_line, end_col]
|
|
|
|
{error_node_id, state_with_error_node} =
|
|
add_node(state_before_token, parent_id, location, raw_token, :unknown, %{
|
|
parsing_error: error_message
|
|
})
|
|
|
|
# The state for further parsing must reflect the consumed token's position and include the new error node
|
|
final_error_state = %{
|
|
state_with_error_node
|
|
| offset: end_offset,
|
|
line: end_line,
|
|
col: end_col
|
|
}
|
|
|
|
{:error_node, error_node_id, error_message, rest_of_source, final_error_state}
|
|
end
|
|
|
|
defp parse_s_expression(original_source_string, source, state, parent_id) do
|
|
# Standard S-expression parsing via parse_collection
|
|
result =
|
|
parse_collection(
|
|
original_source_string,
|
|
source,
|
|
state,
|
|
parent_id,
|
|
"(",
|
|
")",
|
|
:s_expression,
|
|
"Unclosed S-expression",
|
|
"Error parsing element in S-expression. Content might be incomplete."
|
|
)
|
|
|
|
# After parsing, check if it's an 'fn' expression
|
|
case result do
|
|
{:ok, collection_node_id, rest_after_collection, state_after_collection} ->
|
|
collection_node = Map.get(state_after_collection.nodes, collection_node_id)
|
|
|
|
if is_fn_expression?(collection_node, state_after_collection.nodes) do
|
|
transformed_node =
|
|
transform_to_lambda_expression(collection_node, state_after_collection.nodes)
|
|
|
|
final_state = %{
|
|
state_after_collection
|
|
| nodes: Map.put(state_after_collection.nodes, transformed_node.id, transformed_node)
|
|
}
|
|
|
|
{:ok, transformed_node.id, rest_after_collection, final_state}
|
|
else
|
|
# Not an fn expression, return as is
|
|
result
|
|
end
|
|
|
|
_error_or_other ->
|
|
# Propagate errors or other results from parse_collection
|
|
result
|
|
end
|
|
end
|
|
|
|
# Helper to check if an S-expression node is an 'fn' expression
|
|
defp is_fn_expression?(s_expr_node, nodes_map) do
|
|
if s_expr_node.ast_node_type == :s_expression && !Enum.empty?(s_expr_node.children) do
|
|
first_child_id = hd(s_expr_node.children)
|
|
first_child_node = Map.get(nodes_map, first_child_id)
|
|
|
|
first_child_node && first_child_node.ast_node_type == :symbol &&
|
|
first_child_node.name == "fn"
|
|
else
|
|
false
|
|
end
|
|
end
|
|
|
|
  # Helper to transform a generic S-expression node (known to be an 'fn' form)
  # into a :lambda_expression node.
  #
  # Expected shape: (fn (arg_spec* return_type?) body*), where each arg_spec is
  # either a bare symbol (e.g. x) or an (param_symbol type_spec) pair, and the
  # LAST element of a non-empty parameter list is the return type spec.
  # On any structural problem the node is returned with :parsing_error set
  # instead of being transformed.
  defp transform_to_lambda_expression(s_expr_node, nodes_map) do
    # s_expr_node.children = [fn_symbol_id, params_s_expr_id, body_form1_id, ...]
    # Already checked
    _fn_symbol_id = Enum.at(s_expr_node.children, 0)

    if length(s_expr_node.children) < 2 do
      %{s_expr_node | parsing_error: "Malformed 'fn' expression: missing parameters list."}
    else
      params_s_expr_id = Enum.at(s_expr_node.children, 1)
      params_s_expr_node = Map.get(nodes_map, params_s_expr_id)

      if !(params_s_expr_node && params_s_expr_node.ast_node_type == :s_expression) do
        Map.put(
          s_expr_node,
          :parsing_error,
          "Malformed 'fn' expression: parameters list is not an S-expression."
        )
      else
        # Children of the parameters S-expression, e.g. for (fn ((a integer) (b atom) atom) ...),
        # param_s_expr_children_ids would be IDs of [(a integer), (b atom), atom]
        all_param_children_ids = Map.get(params_s_expr_node, :children, [])

        {arg_spec_node_ids, return_type_spec_node_id} =
          if Enum.empty?(all_param_children_ids) do
            # Case: (fn () body) -> No args, nil (inferred) return type spec
            {[], nil}
          else
            # Case: (fn (arg1 type1 ... ret_type) body)
            # Last element is return type spec, rest are arg specs.
            args = Enum.take(all_param_children_ids, length(all_param_children_ids) - 1)
            ret_type_id = List.last(all_param_children_ids)
            {args, ret_type_id}
          end

        # Validate arg_spec_node_ids: each must be a symbol or an S-expr (param_symbol type_spec)
        all_arg_specs_valid =
          Enum.all?(arg_spec_node_ids, fn arg_id ->
            arg_node = Map.get(nodes_map, arg_id)

            case arg_node do
              # e.g. x
              %{ast_node_type: :symbol} ->
                true

              # e.g. (x integer)
              %{ast_node_type: :s_expression, children: s_children} ->
                if length(s_children) == 2 do
                  param_sym_node = Map.get(nodes_map, hd(s_children))
                  type_spec_node = Map.get(nodes_map, hd(tl(s_children)))

                  # The pair must be (symbol, symbol-or-s-expression)
                  param_sym_node && param_sym_node.ast_node_type == :symbol &&
                    type_spec_node &&
                    (type_spec_node.ast_node_type == :symbol ||
                       type_spec_node.ast_node_type == :s_expression)
                else
                  # Not a valid (param_symbol type_spec) structure
                  false
                end

              # Not a symbol or valid S-expression for arg spec
              _ ->
                false
            end
          end)

        # Validate return_type_spec_node_id: must be nil or a valid type specifier node
        return_type_spec_valid =
          if is_nil(return_type_spec_node_id) do
            # Inferred return type is valid
            true
          else
            ret_type_node = Map.get(nodes_map, return_type_spec_node_id)

            ret_type_node &&
              (ret_type_node.ast_node_type == :symbol ||
                 ret_type_node.ast_node_type == :s_expression)
          end

        if all_arg_specs_valid && return_type_spec_valid do
          # Body starts after 'fn' and params_s_expr
          body_node_ids = Enum.drop(s_expr_node.children, 2)

          Map.merge(s_expr_node, %{
            :ast_node_type => :lambda_expression,
            :params_s_expr_id => params_s_expr_id,
            :arg_spec_node_ids => arg_spec_node_ids,
            :return_type_spec_node_id => return_type_spec_node_id,
            :body_node_ids => body_node_ids
          })
        else
          # Determine more specific error message
          error_message =
            cond do
              !all_arg_specs_valid ->
                "Malformed 'fn' expression: invalid argument specification(s)."

              !return_type_spec_valid ->
                "Malformed 'fn' expression: invalid return type specification."

              # Generic fallback
              true ->
                "Malformed 'fn' expression."
            end

          Map.put(s_expr_node, :parsing_error, error_message)
        end
      end
    end
  end
|
|
|
|
defp parse_list_expression(original_source_string, source, state, parent_id) do
|
|
parse_collection(
|
|
original_source_string,
|
|
source,
|
|
state,
|
|
parent_id,
|
|
"[",
|
|
"]",
|
|
:list_expression,
|
|
"Unclosed list",
|
|
"Error parsing element in list. Content might be incomplete."
|
|
)
|
|
end
|
|
|
|
defp parse_map_expression(original_source_string, source, state, parent_id) do
|
|
parse_collection(
|
|
original_source_string,
|
|
source,
|
|
state,
|
|
parent_id,
|
|
# Opening token
|
|
"m{",
|
|
# Closing token
|
|
"}",
|
|
:map_expression,
|
|
"Unclosed map",
|
|
"Error parsing element in map. Content might be incomplete."
|
|
)
|
|
end
|
|
|
|
defp parse_tuple_expression(original_source_string, source, state, parent_id) do
|
|
parse_collection(
|
|
original_source_string,
|
|
source,
|
|
state,
|
|
parent_id,
|
|
"{",
|
|
"}",
|
|
:tuple_expression,
|
|
"Unclosed tuple",
|
|
"Error parsing element in tuple. Content might be incomplete."
|
|
)
|
|
end
|
|
|
|
  # Generic parser for delimited collections: S-expressions "( )", lists "[ ]",
  # maps "m{ }", and tuples "{ }".
  #
  # Consumes the opening token, registers a PRELIMINARY node (end location,
  # raw_string and children still unknown — child parsers need the id to
  # parent to), then delegates element parsing to parse_collection_elements/9
  # which finalizes the node. Returns {:ok, node_id, rest, state} or
  # {:error_node, node_id, reason, rest, state}.
  defp parse_collection(
         original_source_string,
         source,
         state,
         parent_id,
         open_char_str,
         # Used by parse_collection_elements
         close_char_str,
         ast_node_type,
         # Used by parse_collection_elements
         unclosed_error_msg,
         # Used by parse_collection_elements
         element_error_msg
       ) do
    # Consume opening token (e.g. '(', '[', 'm{')
    collection_start_offset = state.offset
    collection_start_line = state.line
    collection_start_col = state.col
    open_char_len = String.length(open_char_str)
    {_opening_token, rest_after_opening_token} = String.split_at(source, open_char_len)
    current_state = advance_pos(state, open_char_str)

    collection_node_id = System.unique_integer([:monotonic, :positive])

    prelim_collection_node = %{
      id: collection_node_id,
      type_id: nil,
      parent_id: parent_id,
      file: current_state.file_name,
      # End TBD
      location: [collection_start_offset, collection_start_line, collection_start_col, 0, 0, 0],
      # TBD
      raw_string: "",
      ast_node_type: ast_node_type,
      children: [],
      parsing_error: nil
    }

    current_state_with_prelim_node = %{
      current_state
      | nodes: Map.put(current_state.nodes, collection_node_id, prelim_collection_node)
    }

    collection_start_pos_for_children =
      {collection_start_offset, collection_start_line, collection_start_col}

    # Pass all necessary params to the generalized element parser
    result =
      parse_collection_elements(
        original_source_string,
        rest_after_opening_token,
        current_state_with_prelim_node,
        collection_node_id,
        [],
        collection_start_pos_for_children,
        # Parameters for generalization, passed from parse_collection's arguments:
        # Used by parse_collection_elements
        close_char_str,
        # Used by parse_collection_elements
        unclosed_error_msg,
        # Passed to parse_collection_elements (might be unused there now)
        element_error_msg
      )

    # Adapt result to {:ok, node_id, ...} or {:error_node, node_id, ...}
    case result do
      {:ok, returned_collection_node_id, rest, state_after_elements} ->
        {:ok, returned_collection_node_id, rest, state_after_elements}

      {:error, reason, rest, state_after_elements} ->
        # The collection_node_id is the ID of the node that has the error.
        # This 'reason' is typically for unclosed collections or fatal element errors.
        {:error_node, collection_node_id, reason, rest, state_after_elements}
    end
  end
|
|
|
|
  # Generalized from parse_s_expression_elements.
  #
  # Tail-recursive element loop for any delimited collection. Child node ids
  # are PREPENDED to children_ids_acc and reversed once on completion.
  #
  # Outcomes:
  #   * closing token found  -> finalizes the collection node (children,
  #     location, raw_string sliced out of original_source_string by offsets)
  #     and returns {:ok, collection_node_id, rest, state}.
  #   * end-of-source first  -> marks the collection node with
  #     `unclosed_error_message` and returns {:error, msg, "", state}.
  defp parse_collection_elements(
         original_source_string,
         source,
         state,
         collection_node_id,
         children_ids_acc,
         collection_start_pos_tuple,
         # New parameters for generalization:
         # e.g., ")" or "]"
         closing_char_str,
         # e.g., "Unclosed S-expression"
         unclosed_error_message,
         # e.g., "Error parsing element in S-expression..."
         # Now potentially unused, marked with underscore
         element_error_message
       ) do
    case skip_whitespace(source, state) do
      {:eos, current_state_at_eos} ->
        # Unclosed collection
        collection_node = Map.get(current_state_at_eos.nodes, collection_node_id)
        start_offset = elem(collection_start_pos_tuple, 0)
        end_offset = current_state_at_eos.offset

        actual_raw_string =
          String.slice(original_source_string, start_offset, end_offset - start_offset)

        updated_collection_node = %{
          collection_node
          | # Use generalized message
            parsing_error: unclosed_error_message,
            children: Enum.reverse(children_ids_acc),
            location: [
              start_offset,
              elem(collection_start_pos_tuple, 1),
              elem(collection_start_pos_tuple, 2),
              end_offset,
              current_state_at_eos.line,
              current_state_at_eos.col
            ],
            raw_string: actual_raw_string
        }

        final_state = %{
          current_state_at_eos
          | nodes:
              Map.put(current_state_at_eos.nodes, collection_node_id, updated_collection_node)
        }

        # This error is for the collection itself being unclosed.
        # The collection_node_id is implicitly the ID of this error node.
        {:error, unclosed_error_message, "", final_state}

      {:ok, remaining_source, current_state} ->
        # Check if the remaining source starts with the closing token string
        if String.starts_with?(remaining_source, closing_char_str) do
          # End of collection
          closing_char_len = String.length(closing_char_str)

          {_closing_token, rest_after_closing_token} =
            String.split_at(remaining_source, closing_char_len)

          final_collection_state = advance_pos(current_state, closing_char_str)
          collection_node = Map.get(final_collection_state.nodes, collection_node_id)

          coll_final_start_offset = elem(collection_start_pos_tuple, 0)
          coll_final_start_line = elem(collection_start_pos_tuple, 1)
          coll_final_start_col = elem(collection_start_pos_tuple, 2)
          coll_final_end_offset = final_collection_state.offset
          coll_final_end_line = final_collection_state.line
          coll_final_end_col = final_collection_state.col

          actual_raw_string =
            String.slice(
              original_source_string,
              coll_final_start_offset,
              coll_final_end_offset - coll_final_start_offset
            )

          updated_collection_node = %{
            collection_node
            | children: Enum.reverse(children_ids_acc),
              location: [
                coll_final_start_offset,
                coll_final_start_line,
                coll_final_start_col,
                coll_final_end_offset,
                coll_final_end_line,
                coll_final_end_col
              ],
              raw_string: actual_raw_string
          }

          final_state_with_collection = %{
            final_collection_state
            | nodes:
                Map.put(
                  final_collection_state.nodes,
                  collection_node_id,
                  updated_collection_node
                )
          }

          {:ok, collection_node_id, rest_after_closing_token, final_state_with_collection}
        else
          # Parse an element
          case parse_datum(
                 original_source_string,
                 remaining_source,
                 current_state,
                 # parent_id for the element
                 collection_node_id
               ) do
            {:ok, child_node_id, next_source_after_elem, next_state_after_elem} ->
              parse_collection_elements(
                original_source_string,
                next_source_after_elem,
                next_state_after_elem,
                collection_node_id,
                # Add successful child's ID
                [child_node_id | children_ids_acc],
                collection_start_pos_tuple,
                closing_char_str,
                unclosed_error_message,
                # Pass through, though may be unused
                element_error_message
              )

            {:error_node, child_error_node_id, _child_reason, next_source_after_elem,
             next_state_after_elem} ->
              # An error node was created for the child element. Add its ID and continue.
              parse_collection_elements(
                original_source_string,
                next_source_after_elem,
                next_state_after_elem,
                collection_node_id,
                # Add error child's ID
                [child_error_node_id | children_ids_acc],
                collection_start_pos_tuple,
                closing_char_str,
                unclosed_error_message,
                # Pass through
                element_error_message
              )

              # No other return types are expected from parse_datum: it either
              # succeeds or creates an error node while consuming input, so this
              # recursion always makes progress and cannot loop forever.
          end
        end
    end
  end
|
|
|
|
# --- Utility Functions ---
|
|
|
|
# Note: The `extra_fields` argument was changed from optional to required
|
|
# as the default value was never used according to compiler warnings.
|
|
defp add_node(state, parent_id, location, raw_string, ast_node_type, extra_fields) do
|
|
node_id = System.unique_integer([:monotonic, :positive])
|
|
|
|
node =
|
|
%{
|
|
id: node_id,
|
|
type_id: nil,
|
|
parent_id: parent_id,
|
|
file: state.file_name,
|
|
# [start_offset, start_line, start_col, end_offset, end_line, end_col]
|
|
location: location,
|
|
raw_string: raw_string,
|
|
ast_node_type: ast_node_type
|
|
}
|
|
|> Map.merge(extra_fields)
|
|
|
|
{node_id, %{state | nodes: Map.put(state.nodes, node_id, node)}}
|
|
end
|
|
|
|
defp skip_whitespace(source, state = %__MODULE__{offset: o, line: l, col: c}) do
|
|
whitespace_match = Regex.run(~r/^\s+/, source)
|
|
|
|
if whitespace_match do
|
|
[ws | _] = whitespace_match
|
|
new_offset = o + String.length(ws)
|
|
{new_line, new_col} = calculate_new_line_col(ws, l, c)
|
|
remaining_source = String.slice(source, String.length(ws)..-1//1)
|
|
{:ok, remaining_source, %{state | offset: new_offset, line: new_line, col: new_col}}
|
|
else
|
|
if String.length(source) == 0 do
|
|
{:eos, state}
|
|
else
|
|
# No leading whitespace
|
|
{:ok, source, state}
|
|
end
|
|
end
|
|
end
|
|
|
|
defp calculate_new_line_col(string_segment, start_line, start_col) do
|
|
string_segment
|
|
|> String.codepoints()
|
|
|> Enum.reduce({start_line, start_col}, fn char, {line, col} ->
|
|
if char == "\n" do
|
|
{line + 1, 1}
|
|
else
|
|
{line, col + 1}
|
|
end
|
|
end)
|
|
end
|
|
|
|
defp advance_pos(state = %__MODULE__{offset: o, line: l, col: c}, consumed_string) do
|
|
new_offset = o + String.length(consumed_string)
|
|
{new_line, new_col} = calculate_new_line_col(consumed_string, l, c)
|
|
%{state | offset: new_offset, line: new_line, col: new_col}
|
|
end
|
|
end
|