From 53825b9abace7225d52458c9e0c04aaa4b934ff9 Mon Sep 17 00:00:00 2001 From: Stefan Strigler Date: Wed, 15 Mar 2023 14:50:40 +0100 Subject: [PATCH 1/6] fix(emqx_bridge): propagate connection error to resource status --- apps/emqx_bridge/i18n/emqx_bridge_schema.conf | 11 ++ apps/emqx_bridge/src/emqx_bridge_api.erl | 102 +++++++++-------- .../src/schema/emqx_bridge_schema.erl | 14 ++- .../test/emqx_bridge_api_SUITE.erl | 107 +++++++++++------- apps/emqx_resource/include/emqx_resource.hrl | 1 + .../src/emqx_resource_manager.erl | 1 + changes/ce/fix-10145.en.md | 1 + 7 files changed, 152 insertions(+), 85 deletions(-) create mode 100644 changes/ce/fix-10145.en.md diff --git a/apps/emqx_bridge/i18n/emqx_bridge_schema.conf b/apps/emqx_bridge/i18n/emqx_bridge_schema.conf index 901f25455..de4ceb0d5 100644 --- a/apps/emqx_bridge/i18n/emqx_bridge_schema.conf +++ b/apps/emqx_bridge/i18n/emqx_bridge_schema.conf @@ -54,6 +54,17 @@ emqx_bridge_schema { } } + desc_status_reason { + desc { + en: "This is the reason given in case a bridge is failing to connect." + zh: "桥接连接失败的原因。" + } + label: { + en: "Failure reason" + zh: "失败原因" + } + } + desc_node_status { desc { en: """The status of the bridge for each node. diff --git a/apps/emqx_bridge/src/emqx_bridge_api.erl b/apps/emqx_bridge/src/emqx_bridge_api.erl index 8ac3e476a..c0e69ce83 100644 --- a/apps/emqx_bridge/src/emqx_bridge_api.erl +++ b/apps/emqx_bridge/src/emqx_bridge_api.erl @@ -748,7 +748,7 @@ pick_bridges_by_id(Type, Name, BridgesAllNodes) -> format_bridge_info_with_metrics([FirstBridge | _] = Bridges) -> Res = maps:remove(node, FirstBridge), - NodeStatus = collect_status(Bridges), + NodeStatus = node_status(Bridges), NodeMetrics = collect_metrics(Bridges), redact(Res#{ status => aggregate_status(NodeStatus), @@ -765,8 +765,8 @@ format_bridge_metrics(Bridges) -> Res = format_bridge_info_with_metrics(Bridges), maps:with([metrics, node_metrics], Res). -collect_status(Bridges) -> - [maps:with([node, status], B) || B <- Bridges]. +node_status(Bridges) -> + [maps:with([node, status, status_reason], B) || B <- Bridges]. aggregate_status(AllStatus) -> Head = fun([A | _]) -> A end, @@ -837,52 +837,63 @@ format_resource( ) ). -format_resource_data(#{status := Status, metrics := Metrics}) -> - #{status => Status, metrics => format_metrics(Metrics)}; -format_resource_data(#{status := Status}) -> - #{status => Status}. +format_resource_data(ResData) -> + maps:fold(fun format_resource_data/3, #{}, maps:with([status, metrics, error], ResData)). -format_metrics(#{ - counters := #{ - 'dropped' := Dropped, - 'dropped.other' := DroppedOther, - 'dropped.expired' := DroppedExpired, - 'dropped.queue_full' := DroppedQueueFull, - 'dropped.resource_not_found' := DroppedResourceNotFound, - 'dropped.resource_stopped' := DroppedResourceStopped, - 'matched' := Matched, - 'retried' := Retried, - 'late_reply' := LateReply, - 'failed' := SentFailed, - 'success' := SentSucc, - 'received' := Rcvd +format_resource_data(error, undefined, Result) -> + Result; +format_resource_data(error, Error, Result) -> + Result#{status_reason => emqx_misc:readable_error_msg(Error)}; +format_resource_data( + metrics, + #{ + counters := #{ + 'dropped' := Dropped, + 'dropped.other' := DroppedOther, + 'dropped.expired' := DroppedExpired, + 'dropped.queue_full' := DroppedQueueFull, + 'dropped.resource_not_found' := DroppedResourceNotFound, + 'dropped.resource_stopped' := DroppedResourceStopped, + 'matched' := Matched, + 'retried' := Retried, + 'late_reply' := LateReply, + 'failed' := SentFailed, + 'success' := SentSucc, + 'received' := Rcvd + }, + gauges := Gauges, + rate := #{ + matched := #{current := Rate, last5m := Rate5m, max := RateMax} + } }, - gauges := Gauges, - rate := #{ - matched := #{current := Rate, last5m := Rate5m, max := RateMax} - } -}) -> + Result +) -> Queued = maps:get('queuing', Gauges, 0), SentInflight = maps:get('inflight', Gauges, 0), - ?METRICS( - Dropped, - DroppedOther, - DroppedExpired, - DroppedQueueFull, - DroppedResourceNotFound, - DroppedResourceStopped, - Matched, - Queued, - Retried, - LateReply, - SentFailed, - SentInflight, - SentSucc, - Rate, - Rate5m, - RateMax, - Rcvd - ). + Result#{ + metrics => + ?METRICS( + Dropped, + DroppedOther, + DroppedExpired, + DroppedQueueFull, + DroppedResourceNotFound, + DroppedResourceStopped, + Matched, + Queued, + Retried, + LateReply, + SentFailed, + SentInflight, + SentSucc, + Rate, + Rate5m, + RateMax, + Rcvd + ) + }; +format_resource_data(K, V, Result) -> + Result#{K => V}. fill_defaults(Type, RawConf) -> PackedConf = pack_bridge_conf(Type, RawConf), @@ -924,6 +935,7 @@ filter_out_request_body(Conf) -> <<"type">>, <<"name">>, <<"status">>, + <<"error">>, <<"node_status">>, <<"node_metrics">>, <<"metrics">>, diff --git a/apps/emqx_bridge/src/schema/emqx_bridge_schema.erl b/apps/emqx_bridge/src/schema/emqx_bridge_schema.erl index 74d2a5ca1..6c278a5ec 100644 --- a/apps/emqx_bridge/src/schema/emqx_bridge_schema.erl +++ b/apps/emqx_bridge/src/schema/emqx_bridge_schema.erl @@ -106,6 +106,12 @@ common_bridge_fields() -> status_fields() -> [ {"status", mk(status(), #{desc => ?DESC("desc_status")})}, + {"status_reason", + mk(binary(), #{ + required => false, + desc => ?DESC("desc_status_reason"), + example => <<"Connection refused">> + })}, {"node_status", mk( hoconsc:array(ref(?MODULE, "node_status")), @@ -190,7 +196,13 @@ fields("node_metrics") -> fields("node_status") -> [ node_name(), - {"status", mk(status(), #{})} + {"status", mk(status(), #{})}, + {"status_reason", + mk(binary(), #{ + required => false, + desc => ?DESC("desc_status_reason"), + example => <<"Connection refused">> + })} ]. desc(bridges) -> diff --git a/apps/emqx_bridge/test/emqx_bridge_api_SUITE.erl b/apps/emqx_bridge/test/emqx_bridge_api_SUITE.erl index 8b388a771..b919dba6b 100644 --- a/apps/emqx_bridge/test/emqx_bridge_api_SUITE.erl +++ b/apps/emqx_bridge/test/emqx_bridge_api_SUITE.erl @@ -23,7 +23,7 @@ -include_lib("eunit/include/eunit.hrl"). -include_lib("common_test/include/ct.hrl"). -define(CONF_DEFAULT, <<"bridges: {}">>). --define(BRIDGE_TYPE, <<"webhook">>). +-define(BRIDGE_TYPE_HTTP, <<"webhook">>). -define(BRIDGE_NAME, (atom_to_binary(?FUNCTION_NAME))). -define(URL(PORT, PATH), list_to_binary( @@ -48,7 +48,7 @@ }). -define(MQTT_BRIDGE(SERVER), ?MQTT_BRIDGE(SERVER, <<"mqtt_egress_test_bridge">>)). --define(HTTP_BRIDGE(URL, TYPE, NAME), ?BRIDGE(NAME, TYPE)#{ +-define(HTTP_BRIDGE(URL, NAME), ?BRIDGE(NAME, ?BRIDGE_TYPE_HTTP)#{ <<"url">> => URL, <<"local_topic">> => <<"emqx_webhook/#">>, <<"method">> => <<"post">>, @@ -57,6 +57,7 @@ <<"content-type">> => <<"application/json">> } }). +-define(HTTP_BRIDGE(URL), ?HTTP_BRIDGE(URL, ?BRIDGE_NAME)). all() -> emqx_common_test_helpers:all(?MODULE). @@ -206,12 +207,12 @@ t_http_crud_apis(Config) -> {ok, 201, Bridge} = request( post, uri(["bridges"]), - ?HTTP_BRIDGE(URL1, ?BRIDGE_TYPE, Name) + ?HTTP_BRIDGE(URL1, Name) ), %ct:pal("---bridge: ~p", [Bridge]), #{ - <<"type">> := ?BRIDGE_TYPE, + <<"type">> := ?BRIDGE_TYPE_HTTP, <<"name">> := Name, <<"enable">> := true, <<"status">> := _, @@ -219,7 +220,7 @@ t_http_crud_apis(Config) -> <<"url">> := URL1 } = emqx_json:decode(Bridge, [return_maps]), - BridgeID = emqx_bridge_resource:bridge_id(?BRIDGE_TYPE, Name), + BridgeID = emqx_bridge_resource:bridge_id(?BRIDGE_TYPE_HTTP, Name), %% send an message to emqx and the message should be forwarded to the HTTP server Body = <<"my msg">>, emqx:publish(emqx_message:make(<<"emqx_webhook/1">>, Body)), @@ -243,11 +244,11 @@ t_http_crud_apis(Config) -> {ok, 200, Bridge2} = request( put, uri(["bridges", BridgeID]), - ?HTTP_BRIDGE(URL2, ?BRIDGE_TYPE, Name) + ?HTTP_BRIDGE(URL2, Name) ), ?assertMatch( #{ - <<"type">> := ?BRIDGE_TYPE, + <<"type">> := ?BRIDGE_TYPE_HTTP, <<"name">> := Name, <<"enable">> := true, <<"status">> := _, @@ -262,7 +263,7 @@ t_http_crud_apis(Config) -> ?assertMatch( [ #{ - <<"type">> := ?BRIDGE_TYPE, + <<"type">> := ?BRIDGE_TYPE_HTTP, <<"name">> := Name, <<"enable">> := true, <<"status">> := _, @@ -279,7 +280,7 @@ t_http_crud_apis(Config) -> {ok, 200, Bridge3Str} = request(get, uri(["bridges", BridgeID]), []), ?assertMatch( #{ - <<"type">> := ?BRIDGE_TYPE, + <<"type">> := ?BRIDGE_TYPE_HTTP, <<"name">> := Name, <<"enable">> := true, <<"status">> := _, @@ -311,7 +312,7 @@ t_http_crud_apis(Config) -> {ok, 404, ErrMsg2} = request( put, uri(["bridges", BridgeID]), - ?HTTP_BRIDGE(URL2, ?BRIDGE_TYPE, Name) + ?HTTP_BRIDGE(URL2, Name) ), ?assertMatch( #{ @@ -340,6 +341,34 @@ t_http_crud_apis(Config) -> }, emqx_json:decode(ErrMsg3, [return_maps]) ), + + %% Create non working bridge + BrokenURL = ?URL(Port + 1, "/foo"), + {ok, 201, BrokenBridge} = request( + post, + uri(["bridges"]), + ?HTTP_BRIDGE(BrokenURL, Name) + ), + #{ + <<"type">> := ?BRIDGE_TYPE_HTTP, + <<"name">> := Name, + <<"enable">> := true, + <<"status">> := <<"disconnected">>, + <<"status_reason">> := <<"Connection refused">>, + <<"node_status">> := [ + #{<<"status">> := <<"disconnected">>, <<"status_reason">> := <<"Connection refused">>} + | _ + ], + <<"url">> := BrokenURL + } = emqx_json:decode(BrokenBridge, [return_maps]), + {ok, 200, FixedBridgeResponse} = request(put, uri(["bridges", BridgeID]), ?HTTP_BRIDGE(URL1)), + #{ + <<"status">> := <<"connected">>, + <<"node_status">> := [FixedNodeStatus = #{<<"status">> := <<"connected">>} | _] + } = FixedBridge = emqx_json:decode(FixedBridgeResponse, [return_maps]), + ?assert(not maps:is_key(<<"status_reason">>, FixedBridge)), + ?assert(not maps:is_key(<<"status_reason">>, FixedNodeStatus)), + {ok, 204, <<>>} = request(delete, uri(["bridges", BridgeID]), []), ok. t_http_bridges_local_topic(Config) -> @@ -356,16 +385,16 @@ t_http_bridges_local_topic(Config) -> {ok, 201, _} = request( post, uri(["bridges"]), - ?HTTP_BRIDGE(URL1, ?BRIDGE_TYPE, Name1) + ?HTTP_BRIDGE(URL1, Name1) ), %% and we create another one without local_topic {ok, 201, _} = request( post, uri(["bridges"]), - maps:remove(<<"local_topic">>, ?HTTP_BRIDGE(URL1, ?BRIDGE_TYPE, Name2)) + maps:remove(<<"local_topic">>, ?HTTP_BRIDGE(URL1, Name2)) ), - BridgeID1 = emqx_bridge_resource:bridge_id(?BRIDGE_TYPE, Name1), - BridgeID2 = emqx_bridge_resource:bridge_id(?BRIDGE_TYPE, Name2), + BridgeID1 = emqx_bridge_resource:bridge_id(?BRIDGE_TYPE_HTTP, Name1), + BridgeID2 = emqx_bridge_resource:bridge_id(?BRIDGE_TYPE_HTTP, Name2), %% Send an message to emqx and the message should be forwarded to the HTTP server. %% This is to verify we can have 2 bridges with and without local_topic fields %% at the same time. @@ -400,11 +429,11 @@ t_check_dependent_actions_on_delete(Config) -> %% POST /bridges/ will create a bridge URL1 = ?URL(Port, "path1"), Name = <<"t_http_crud_apis">>, - BridgeID = emqx_bridge_resource:bridge_id(?BRIDGE_TYPE, Name), + BridgeID = emqx_bridge_resource:bridge_id(?BRIDGE_TYPE_HTTP, Name), {ok, 201, _} = request( post, uri(["bridges"]), - ?HTTP_BRIDGE(URL1, ?BRIDGE_TYPE, Name) + ?HTTP_BRIDGE(URL1, Name) ), {ok, 201, Rule} = request( post, @@ -438,11 +467,11 @@ t_cascade_delete_actions(Config) -> %% POST /bridges/ will create a bridge URL1 = ?URL(Port, "path1"), Name = <<"t_http_crud_apis">>, - BridgeID = emqx_bridge_resource:bridge_id(?BRIDGE_TYPE, Name), + BridgeID = emqx_bridge_resource:bridge_id(?BRIDGE_TYPE_HTTP, Name), {ok, 201, _} = request( post, uri(["bridges"]), - ?HTTP_BRIDGE(URL1, ?BRIDGE_TYPE, Name) + ?HTTP_BRIDGE(URL1, Name) ), {ok, 201, Rule} = request( post, @@ -472,7 +501,7 @@ t_cascade_delete_actions(Config) -> {ok, 201, _} = request( post, uri(["bridges"]), - ?HTTP_BRIDGE(URL1, ?BRIDGE_TYPE, Name) + ?HTTP_BRIDGE(URL1, Name) ), {ok, 201, _} = request( post, @@ -496,9 +525,9 @@ t_broken_bpapi_vsn(Config) -> {ok, 201, _Bridge} = request( post, uri(["bridges"]), - ?HTTP_BRIDGE(URL1, ?BRIDGE_TYPE, Name) + ?HTTP_BRIDGE(URL1, Name) ), - BridgeID = emqx_bridge_resource:bridge_id(?BRIDGE_TYPE, Name), + BridgeID = emqx_bridge_resource:bridge_id(?BRIDGE_TYPE_HTTP, Name), %% still works since we redirect to 'restart' {ok, 501, <<>>} = request(post, operation_path(cluster, start, BridgeID), <<"">>), {ok, 501, <<>>} = request(post, operation_path(node, start, BridgeID), <<"">>), @@ -511,9 +540,9 @@ t_old_bpapi_vsn(Config) -> {ok, 201, _Bridge} = request( post, uri(["bridges"]), - ?HTTP_BRIDGE(URL1, ?BRIDGE_TYPE, Name) + ?HTTP_BRIDGE(URL1, Name) ), - BridgeID = emqx_bridge_resource:bridge_id(?BRIDGE_TYPE, Name), + BridgeID = emqx_bridge_resource:bridge_id(?BRIDGE_TYPE_HTTP, Name), {ok, 204, <<>>} = request(post, operation_path(cluster, stop, BridgeID), <<"">>), {ok, 204, <<>>} = request(post, operation_path(node, stop, BridgeID), <<"">>), %% still works since we redirect to 'restart' @@ -551,18 +580,18 @@ do_start_stop_bridges(Type, Config) -> {ok, 201, Bridge} = request( post, uri(["bridges"]), - ?HTTP_BRIDGE(URL1, ?BRIDGE_TYPE, Name) + ?HTTP_BRIDGE(URL1, Name) ), %ct:pal("the bridge ==== ~p", [Bridge]), #{ - <<"type">> := ?BRIDGE_TYPE, + <<"type">> := ?BRIDGE_TYPE_HTTP, <<"name">> := Name, <<"enable">> := true, <<"status">> := <<"connected">>, <<"node_status">> := [_ | _], <<"url">> := URL1 } = emqx_json:decode(Bridge, [return_maps]), - BridgeID = emqx_bridge_resource:bridge_id(?BRIDGE_TYPE, Name), + BridgeID = emqx_bridge_resource:bridge_id(?BRIDGE_TYPE_HTTP, Name), %% stop it {ok, 204, <<>>} = request(post, operation_path(Type, stop, BridgeID), <<"">>), {ok, 200, Bridge2} = request(get, uri(["bridges", BridgeID]), []), @@ -633,18 +662,18 @@ t_enable_disable_bridges(Config) -> {ok, 201, Bridge} = request( post, uri(["bridges"]), - ?HTTP_BRIDGE(URL1, ?BRIDGE_TYPE, Name) + ?HTTP_BRIDGE(URL1, Name) ), %ct:pal("the bridge ==== ~p", [Bridge]), #{ - <<"type">> := ?BRIDGE_TYPE, + <<"type">> := ?BRIDGE_TYPE_HTTP, <<"name">> := Name, <<"enable">> := true, <<"status">> := <<"connected">>, <<"node_status">> := [_ | _], <<"url">> := URL1 } = emqx_json:decode(Bridge, [return_maps]), - BridgeID = emqx_bridge_resource:bridge_id(?BRIDGE_TYPE, Name), + BridgeID = emqx_bridge_resource:bridge_id(?BRIDGE_TYPE_HTTP, Name), %% disable it {ok, 204, <<>>} = request(put, enable_path(false, BridgeID), <<"">>), {ok, 200, Bridge2} = request(get, uri(["bridges", BridgeID]), []), @@ -690,18 +719,18 @@ t_reset_bridges(Config) -> {ok, 201, Bridge} = request( post, uri(["bridges"]), - ?HTTP_BRIDGE(URL1, ?BRIDGE_TYPE, Name) + ?HTTP_BRIDGE(URL1, Name) ), %ct:pal("the bridge ==== ~p", [Bridge]), #{ - <<"type">> := ?BRIDGE_TYPE, + <<"type">> := ?BRIDGE_TYPE_HTTP, <<"name">> := Name, <<"enable">> := true, <<"status">> := <<"connected">>, <<"node_status">> := [_ | _], <<"url">> := URL1 } = emqx_json:decode(Bridge, [return_maps]), - BridgeID = emqx_bridge_resource:bridge_id(?BRIDGE_TYPE, Name), + BridgeID = emqx_bridge_resource:bridge_id(?BRIDGE_TYPE_HTTP, Name), {ok, 204, <<>>} = request(put, uri(["bridges", BridgeID, "metrics/reset"]), []), %% delete the bridge @@ -748,20 +777,20 @@ t_bridges_probe(Config) -> {ok, 204, <<>>} = request( post, uri(["bridges_probe"]), - ?HTTP_BRIDGE(URL, ?BRIDGE_TYPE, ?BRIDGE_NAME) + ?HTTP_BRIDGE(URL) ), %% second time with same name is ok since no real bridge created {ok, 204, <<>>} = request( post, uri(["bridges_probe"]), - ?HTTP_BRIDGE(URL, ?BRIDGE_TYPE, ?BRIDGE_NAME) + ?HTTP_BRIDGE(URL) ), {ok, 400, NxDomain} = request( post, uri(["bridges_probe"]), - ?HTTP_BRIDGE(<<"http://203.0.113.3:1234/foo">>, ?BRIDGE_TYPE, ?BRIDGE_NAME) + ?HTTP_BRIDGE(<<"http://203.0.113.3:1234/foo">>) ), ?assertMatch( #{ @@ -882,12 +911,12 @@ t_metrics(Config) -> {ok, 201, Bridge} = request( post, uri(["bridges"]), - ?HTTP_BRIDGE(URL1, ?BRIDGE_TYPE, Name) + ?HTTP_BRIDGE(URL1, Name) ), %ct:pal("---bridge: ~p", [Bridge]), #{ - <<"type">> := ?BRIDGE_TYPE, + <<"type">> := ?BRIDGE_TYPE_HTTP, <<"name">> := Name, <<"enable">> := true, <<"status">> := _, @@ -895,7 +924,7 @@ t_metrics(Config) -> <<"url">> := URL1 } = emqx_json:decode(Bridge, [return_maps]), - BridgeID = emqx_bridge_resource:bridge_id(?BRIDGE_TYPE, Name), + BridgeID = emqx_bridge_resource:bridge_id(?BRIDGE_TYPE_HTTP, Name), %% check for empty bridge metrics {ok, 200, Bridge1Str} = request(get, uri(["bridges", BridgeID, "metrics"]), []), @@ -963,7 +992,7 @@ t_inconsistent_webhook_request_timeouts(Config) -> Name = ?BRIDGE_NAME, BadBridgeParams = emqx_map_lib:deep_merge( - ?HTTP_BRIDGE(URL1, ?BRIDGE_TYPE, Name), + ?HTTP_BRIDGE(URL1, Name), #{ <<"request_timeout">> => <<"1s">>, <<"resource_opts">> => #{<<"request_timeout">> => <<"2s">>} diff --git a/apps/emqx_resource/include/emqx_resource.hrl b/apps/emqx_resource/include/emqx_resource.hrl index 41be9e8a0..ae22e27e0 100644 --- a/apps/emqx_resource/include/emqx_resource.hrl +++ b/apps/emqx_resource/include/emqx_resource.hrl @@ -41,6 +41,7 @@ callback_mode := callback_mode(), query_mode := query_mode(), config := resource_config(), + error := term(), state := resource_state(), status := resource_status(), metrics => emqx_metrics_worker:metrics() diff --git a/apps/emqx_resource/src/emqx_resource_manager.erl b/apps/emqx_resource/src/emqx_resource_manager.erl index 40f9fe1ab..1bc6a3b6a 100644 --- a/apps/emqx_resource/src/emqx_resource_manager.erl +++ b/apps/emqx_resource/src/emqx_resource_manager.erl @@ -666,6 +666,7 @@ maybe_reply(Actions, From, Reply) -> data_record_to_external_map(Data) -> #{ id => Data#data.id, + error => Data#data.error, mod => Data#data.mod, callback_mode => Data#data.callback_mode, query_mode => Data#data.query_mode, diff --git a/changes/ce/fix-10145.en.md b/changes/ce/fix-10145.en.md new file mode 100644 index 000000000..bddbd9085 --- /dev/null +++ b/changes/ce/fix-10145.en.md @@ -0,0 +1 @@ +Fix `bridges` API to report error conditions for a failing bridge as `status_reason`. From c1384b6e6ed805985e93b13d5083a974dbc55fa8 Mon Sep 17 00:00:00 2001 From: Stefan Strigler Date: Wed, 15 Mar 2023 14:52:03 +0100 Subject: [PATCH 2/6] feat(emqx_resource): include error with alarm for resource_down --- .../emqx_resource/src/emqx_resource_manager.erl | 17 +++++++++++------ changes/ce/fix-10145.en.md | 4 +++- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/apps/emqx_resource/src/emqx_resource_manager.erl b/apps/emqx_resource/src/emqx_resource_manager.erl index 1bc6a3b6a..b21ffcae3 100644 --- a/apps/emqx_resource/src/emqx_resource_manager.erl +++ b/apps/emqx_resource/src/emqx_resource_manager.erl @@ -522,7 +522,7 @@ start_resource(Data, From) -> id => Data#data.id, reason => Reason }), - _ = maybe_alarm(disconnected, Data#data.id), + _ = maybe_alarm(disconnected, Data#data.id, Data#data.error), %% Keep track of the error reason why the connection did not work %% so that the Reason can be returned when the verification call is made. UpdatedData = Data#data{status = disconnected, error = Reason}, @@ -597,7 +597,7 @@ with_health_check(Data, Func) -> ResId = Data#data.id, HCRes = emqx_resource:call_health_check(Data#data.manager_id, Data#data.mod, Data#data.state), {Status, NewState, Err} = parse_health_check_result(HCRes, Data), - _ = maybe_alarm(Status, ResId), + _ = maybe_alarm(Status, ResId, Err), ok = maybe_resume_resource_workers(ResId, Status), UpdatedData = Data#data{ state = NewState, status = Status, error = Err @@ -616,15 +616,20 @@ update_state(Data, _DataWas) -> health_check_interval(Opts) -> maps:get(health_check_interval, Opts, ?HEALTHCHECK_INTERVAL). -maybe_alarm(connected, _ResId) -> +maybe_alarm(connected, _ResId, _Error) -> ok; -maybe_alarm(_Status, <>) -> +maybe_alarm(_Status, <>, _Error) -> ok; -maybe_alarm(_Status, ResId) -> +maybe_alarm(_Status, ResId, Error) -> + HrError = + case Error of + undefined -> <<"Unknown reason">>; + _Else -> emqx_misc:readable_error_msg(Error) + end, emqx_alarm:activate( ResId, #{resource_id => ResId, reason => resource_down}, - <<"resource down: ", ResId/binary>> + <<"resource down: ", HrError/binary>> ). maybe_resume_resource_workers(ResId, connected) -> diff --git a/changes/ce/fix-10145.en.md b/changes/ce/fix-10145.en.md index bddbd9085..eaa896793 100644 --- a/changes/ce/fix-10145.en.md +++ b/changes/ce/fix-10145.en.md @@ -1 +1,3 @@ -Fix `bridges` API to report error conditions for a failing bridge as `status_reason`. +Fix `bridges` API to report error conditions for a failing bridge as +`status_reason`. Also when creating an alarm for a failing resource we include +this error condition with the alarm's message. From 8af3fb4ee7536170159001eb8282db69e96d15b4 Mon Sep 17 00:00:00 2001 From: Stefan Strigler Date: Wed, 15 Mar 2023 14:53:18 +0100 Subject: [PATCH 3/6] feat: move human readable error translations to emqx_misc --- apps/emqx/src/emqx_misc.erl | 15 ++++++++++++++- apps/emqx_bridge/src/emqx_bridge_api.erl | 17 ++--------------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/apps/emqx/src/emqx_misc.erl b/apps/emqx/src/emqx_misc.erl index 18ecc644a..59b5b1be9 100644 --- a/apps/emqx/src/emqx_misc.erl +++ b/apps/emqx/src/emqx_misc.erl @@ -545,10 +545,23 @@ readable_error_msg(Error) -> {ok, Msg} -> Msg; false -> - iolist_to_binary(io_lib:format("~0p", [Error])) + to_hr_error(Error) end end. +to_hr_error(nxdomain) -> + <<"Host not found">>; +to_hr_error(econnrefused) -> + <<"Connection refused">>; +to_hr_error({unauthorized_client, _}) -> + <<"Unauthorized client">>; +to_hr_error({not_authorized, _}) -> + <<"Not authorized">>; +to_hr_error({malformed_username_or_password, _}) -> + <<"Malformed username or password">>; +to_hr_error(Error) -> + iolist_to_binary(io_lib:format("~0p", [Error])). + try_to_existing_atom(Convert, Data, Encoding) -> try Convert(Data, Encoding) of Atom -> diff --git a/apps/emqx_bridge/src/emqx_bridge_api.erl b/apps/emqx_bridge/src/emqx_bridge_api.erl index c0e69ce83..e466b418d 100644 --- a/apps/emqx_bridge/src/emqx_bridge_api.erl +++ b/apps/emqx_bridge/src/emqx_bridge_api.erl @@ -568,7 +568,7 @@ schema("/bridges_probe") -> ok -> 204; {error, Reason} when not is_tuple(Reason); element(1, Reason) =/= 'exit' -> - {400, error_msg('TEST_FAILED', to_hr_reason(Reason))} + {400, error_msg('TEST_FAILED', emqx_misc:readable_error_msg(Reason))} end; BadRequest -> BadRequest @@ -979,7 +979,7 @@ call_operation(NodeOrAll, OperFunc, Args = [_Nodes, BridgeType, BridgeName]) -> {error, {node_not_found, Node}} -> ?NOT_FOUND(<<"Node not found: ", (atom_to_binary(Node))/binary>>); {error, Reason} when not is_tuple(Reason); element(1, Reason) =/= 'exit' -> - ?BAD_REQUEST(to_hr_reason(Reason)) + ?BAD_REQUEST(emqx_misc:readable_error_msg(Reason)) end. maybe_try_restart(all, start_bridges_to_all_nodes, Args) -> @@ -1018,19 +1018,6 @@ supported_versions(start_bridge_to_node) -> [2, 3]; supported_versions(start_bridges_to_all_nodes) -> [2, 3]; supported_versions(_Call) -> [1, 2, 3]. -to_hr_reason(nxdomain) -> - <<"Host not found">>; -to_hr_reason(econnrefused) -> - <<"Connection refused">>; -to_hr_reason({unauthorized_client, _}) -> - <<"Unauthorized client">>; -to_hr_reason({not_authorized, _}) -> - <<"Not authorized">>; -to_hr_reason({malformed_username_or_password, _}) -> - <<"Malformed username or password">>; -to_hr_reason(Reason) -> - Reason. - redact(Term) -> emqx_misc:redact(Term). From 84fc64822ef532ac51c53eeac0cff08195e883e4 Mon Sep 17 00:00:00 2001 From: Stefan Strigler Date: Tue, 21 Mar 2023 10:29:24 +0100 Subject: [PATCH 4/6] style: fix wording for nxdomain and malformed_username_or_password Co-authored-by: Zaiming (Stone) Shi --- apps/emqx/src/emqx_misc.erl | 4 ++-- apps/emqx_bridge/test/emqx_bridge_api_SUITE.erl | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/apps/emqx/src/emqx_misc.erl b/apps/emqx/src/emqx_misc.erl index 59b5b1be9..cdd62df11 100644 --- a/apps/emqx/src/emqx_misc.erl +++ b/apps/emqx/src/emqx_misc.erl @@ -550,7 +550,7 @@ readable_error_msg(Error) -> end. to_hr_error(nxdomain) -> - <<"Host not found">>; + <<"Could not resolve host">>; to_hr_error(econnrefused) -> <<"Connection refused">>; to_hr_error({unauthorized_client, _}) -> @@ -558,7 +558,7 @@ to_hr_error({unauthorized_client, _}) -> to_hr_error({not_authorized, _}) -> <<"Not authorized">>; to_hr_error({malformed_username_or_password, _}) -> - <<"Malformed username or password">>; + <<"Bad username or password">>; to_hr_error(Error) -> iolist_to_binary(io_lib:format("~0p", [Error])). diff --git a/apps/emqx_bridge/test/emqx_bridge_api_SUITE.erl b/apps/emqx_bridge/test/emqx_bridge_api_SUITE.erl index b919dba6b..986c0d29d 100644 --- a/apps/emqx_bridge/test/emqx_bridge_api_SUITE.erl +++ b/apps/emqx_bridge/test/emqx_bridge_api_SUITE.erl @@ -819,7 +819,7 @@ t_bridges_probe(Config) -> emqx_json:decode(ConnRefused, [return_maps]) ), - {ok, 400, HostNotFound} = request( + {ok, 400, CouldNotResolveHost} = request( post, uri(["bridges_probe"]), ?MQTT_BRIDGE(<<"nohost:2883">>) @@ -827,9 +827,9 @@ t_bridges_probe(Config) -> ?assertMatch( #{ <<"code">> := <<"TEST_FAILED">>, - <<"message">> := <<"Host not found">> + <<"message">> := <<"Could not resolve host">> }, - emqx_json:decode(HostNotFound, [return_maps]) + emqx_json:decode(CouldNotResolveHost, [return_maps]) ), AuthnConfig = #{ @@ -873,7 +873,7 @@ t_bridges_probe(Config) -> ?assertMatch( #{ <<"code">> := <<"TEST_FAILED">>, - <<"message">> := <<"Malformed username or password">> + <<"message">> := <<"Bad username or password">> }, emqx_json:decode(Malformed, [return_maps]) ), From 4b0ea562a28e0ac8ae874998fd8b12c40855b04c Mon Sep 17 00:00:00 2001 From: Stefan Strigler Date: Tue, 21 Mar 2023 14:21:30 +0100 Subject: [PATCH 5/6] refactor(emqx_bridge): consistently use macros for http response --- apps/emqx_bridge/src/emqx_bridge_api.erl | 79 +++++++++++++----------- 1 file changed, 42 insertions(+), 37 deletions(-) diff --git a/apps/emqx_bridge/src/emqx_bridge_api.erl b/apps/emqx_bridge/src/emqx_bridge_api.erl index e466b418d..917b44096 100644 --- a/apps/emqx_bridge/src/emqx_bridge_api.erl +++ b/apps/emqx_bridge/src/emqx_bridge_api.erl @@ -46,18 +46,33 @@ -export([lookup_from_local_node/2]). --define(BAD_REQUEST(Reason), {400, error_msg('BAD_REQUEST', Reason)}). +%% [TODO] Move those to a commonly shared header file +-define(ERROR_MSG(CODE, REASON), #{code => CODE, message => emqx_misc:readable_error_msg(REASON)}). + +-define(OK(CONTENT), {200, CONTENT}). + +-define(NO_CONTENT, 204). + +-define(BAD_REQUEST(CODE, REASON), {400, ?ERROR_MSG(CODE, REASON)}). +-define(BAD_REQUEST(REASON), ?BAD_REQUEST('BAD_REQUEST', REASON)). + +-define(NOT_FOUND(REASON), {404, ?ERROR_MSG('NOT_FOUND', REASON)}). + +-define(INTERNAL_ERROR(REASON), {500, ?ERROR_MSG('INTERNAL_ERROR', REASON)}). + +-define(NOT_IMPLEMENTED, 501). + +-define(SERVICE_UNAVAILABLE(REASON), {503, ?ERROR_MSG('SERVICE_UNAVAILABLE', REASON)}). +%% End TODO -define(BRIDGE_NOT_ENABLED, ?BAD_REQUEST(<<"Forbidden operation, bridge not enabled">>) ). --define(NOT_FOUND(Reason), {404, error_msg('NOT_FOUND', Reason)}). - --define(BRIDGE_NOT_FOUND(BridgeType, BridgeName), +-define(BRIDGE_NOT_FOUND(BRIDGE_TYPE, BRIDGE_NAME), ?NOT_FOUND( - <<"Bridge lookup failed: bridge named '", BridgeName/binary, "' of type ", - (atom_to_binary(BridgeType))/binary, " does not exist.">> + <<"Bridge lookup failed: bridge named '", BRIDGE_NAME/binary, "' of type ", + (atom_to_binary(BRIDGE_TYPE))/binary, " does not exist.">> ) ). @@ -480,7 +495,7 @@ schema("/bridges_probe") -> '/bridges'(post, #{body := #{<<"type">> := BridgeType, <<"name">> := BridgeName} = Conf0}) -> case emqx_bridge:lookup(BridgeType, BridgeName) of {ok, _} -> - {400, error_msg('ALREADY_EXISTS', <<"bridge already exists">>)}; + ?BAD_REQUEST('ALREADY_EXISTS', <<"bridge already exists">>); {error, not_found} -> Conf = filter_out_request_body(Conf0), {ok, _} = emqx_bridge:create(BridgeType, BridgeName, Conf), @@ -495,9 +510,9 @@ schema("/bridges_probe") -> format_resource(Data, Node) || {Node, Bridges} <- lists:zip(Nodes, NodeBridges), Data <- Bridges ], - {200, zip_bridges([AllBridges])}; + ?OK(zip_bridges([AllBridges])); {error, Reason} -> - {500, error_msg('INTERNAL_ERROR', Reason)} + ?INTERNAL_ERROR(Reason) end. '/bridges/:id'(get, #{bindings := #{id := Id}}) -> @@ -529,16 +544,16 @@ schema("/bridges_probe") -> end, case emqx_bridge:check_deps_and_remove(BridgeType, BridgeName, AlsoDeleteActs) of {ok, _} -> - 204; + ?NO_CONTENT; {error, {rules_deps_on_this_bridge, RuleIds}} -> ?BAD_REQUEST( {<<"Cannot delete bridge while active rules are defined for this bridge">>, RuleIds} ); {error, timeout} -> - {503, error_msg('SERVICE_UNAVAILABLE', <<"request timeout">>)}; + ?SERVICE_UNAVAILABLE(<<"request timeout">>); {error, Reason} -> - {500, error_msg('INTERNAL_ERROR', Reason)} + ?INTERNAL_ERROR(Reason) end; {error, not_found} -> ?BRIDGE_NOT_FOUND(BridgeType, BridgeName) @@ -555,7 +570,7 @@ schema("/bridges_probe") -> ok = emqx_bridge_resource:reset_metrics( emqx_bridge_resource:resource_id(BridgeType, BridgeName) ), - {204} + ?NO_CONTENT end ). @@ -566,9 +581,9 @@ schema("/bridges_probe") -> Params1 = maybe_deobfuscate_bridge_probe(Params), case emqx_bridge_resource:create_dry_run(ConnType, maps:remove(<<"type">>, Params1)) of ok -> - 204; + ?NO_CONTENT; {error, Reason} when not is_tuple(Reason); element(1, Reason) =/= 'exit' -> - {400, error_msg('TEST_FAILED', emqx_misc:readable_error_msg(Reason))} + ?BAD_REQUEST('TEST_FAILED', Reason) end; BadRequest -> BadRequest @@ -602,7 +617,7 @@ do_lookup_from_all_nodes(BridgeType, BridgeName, SuccCode, FormatFun) -> {ok, [{error, not_found} | _]} -> ?BRIDGE_NOT_FOUND(BridgeType, BridgeName); {error, Reason} -> - {500, error_msg('INTERNAL_ERROR', Reason)} + ?INTERNAL_ERROR(Reason) end. lookup_from_local_node(BridgeType, BridgeName) -> @@ -620,15 +635,15 @@ lookup_from_local_node(BridgeType, BridgeName) -> OperFunc -> case emqx_bridge:disable_enable(OperFunc, BridgeType, BridgeName) of {ok, _} -> - 204; + ?NO_CONTENT; {error, {pre_config_update, _, bridge_not_found}} -> ?BRIDGE_NOT_FOUND(BridgeType, BridgeName); {error, {_, _, timeout}} -> - {503, error_msg('SERVICE_UNAVAILABLE', <<"request timeout">>)}; + ?SERVICE_UNAVAILABLE(<<"request timeout">>); {error, timeout} -> - {503, error_msg('SERVICE_UNAVAILABLE', <<"request timeout">>)}; + ?SERVICE_UNAVAILABLE(<<"request timeout">>); {error, Reason} -> - {500, error_msg('INTERNAL_ERROR', Reason)} + ?INTERNAL_ERROR(Reason) end end ). @@ -943,9 +958,6 @@ filter_out_request_body(Conf) -> ], maps:without(ExtraConfs, Conf). -error_msg(Code, Msg) -> - #{code => Code, message => emqx_misc:readable_error_msg(Msg)}. - bin(S) when is_list(S) -> list_to_binary(S); bin(S) when is_atom(S) -> @@ -956,30 +968,23 @@ bin(S) when is_binary(S) -> call_operation(NodeOrAll, OperFunc, Args = [_Nodes, BridgeType, BridgeName]) -> case is_ok(do_bpapi_call(NodeOrAll, OperFunc, Args)) of Ok when Ok =:= ok; is_tuple(Ok), element(1, Ok) =:= ok -> - 204; + ?NO_CONTENT; {error, not_implemented} -> %% Should only happen if we call `start` on a node that is %% still on an older bpapi version that doesn't support it. maybe_try_restart(NodeOrAll, OperFunc, Args); {error, timeout} -> - {503, error_msg('SERVICE_UNAVAILABLE', <<"request timeout">>)}; + ?SERVICE_UNAVAILABLE(<<"Request timeout">>); {error, {start_pool_failed, Name, Reason}} -> - {503, - error_msg( - 'SERVICE_UNAVAILABLE', - bin( - io_lib:format( - "failed to start ~p pool for reason ~p", - [Name, Reason] - ) - ) - )}; + ?SERVICE_UNAVAILABLE( + bin(io_lib:format("Failed to start ~p pool for reason ~p", [Name, Reason])) + ); {error, not_found} -> ?BRIDGE_NOT_FOUND(BridgeType, BridgeName); {error, {node_not_found, Node}} -> ?NOT_FOUND(<<"Node not found: ", (atom_to_binary(Node))/binary>>); {error, Reason} when not is_tuple(Reason); element(1, Reason) =/= 'exit' -> - ?BAD_REQUEST(emqx_misc:readable_error_msg(Reason)) + ?BAD_REQUEST(Reason) end. maybe_try_restart(all, start_bridges_to_all_nodes, Args) -> @@ -987,7 +992,7 @@ maybe_try_restart(all, start_bridges_to_all_nodes, Args) -> maybe_try_restart(Node, start_bridge_to_node, Args) -> call_operation(Node, restart_bridge_to_node, Args); maybe_try_restart(_, _, _) -> - 501. + ?NOT_IMPLEMENTED. do_bpapi_call(all, Call, Args) -> maybe_unwrap( From 3880862c810b386b35ebcd83fe0c596cb10b3809 Mon Sep 17 00:00:00 2001 From: Stefan Strigler Date: Tue, 21 Mar 2023 14:22:04 +0100 Subject: [PATCH 6/6] fix(emqx_bridge): return 503 for inconsistency in bridge setup --- apps/emqx_bridge/src/emqx_bridge_api.erl | 10 ++++++- .../test/emqx_bridge_api_SUITE.erl | 30 +++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/apps/emqx_bridge/src/emqx_bridge_api.erl b/apps/emqx_bridge/src/emqx_bridge_api.erl index 917b44096..f7a1bb345 100644 --- a/apps/emqx_bridge/src/emqx_bridge_api.erl +++ b/apps/emqx_bridge/src/emqx_bridge_api.erl @@ -980,7 +980,15 @@ call_operation(NodeOrAll, OperFunc, Args = [_Nodes, BridgeType, BridgeName]) -> bin(io_lib:format("Failed to start ~p pool for reason ~p", [Name, Reason])) ); {error, not_found} -> - ?BRIDGE_NOT_FOUND(BridgeType, BridgeName); + BridgeId = emqx_bridge_resource:bridge_id(BridgeType, BridgeName), + ?SLOG(warning, #{ + msg => "bridge_inconsistent_in_cluster_for_call_operation", + reason => not_found, + type => BridgeType, + name => BridgeName, + bridge => BridgeId + }), + ?SERVICE_UNAVAILABLE(<<"Bridge not found on remote node: ", BridgeId/binary>>); {error, {node_not_found, Node}} -> ?NOT_FOUND(<<"Node not found: ", (atom_to_binary(Node))/binary>>); {error, Reason} when not is_tuple(Reason); element(1, Reason) =/= 'exit' -> diff --git a/apps/emqx_bridge/test/emqx_bridge_api_SUITE.erl b/apps/emqx_bridge/test/emqx_bridge_api_SUITE.erl index 986c0d29d..45ab2b623 100644 --- a/apps/emqx_bridge/test/emqx_bridge_api_SUITE.erl +++ b/apps/emqx_bridge/test/emqx_bridge_api_SUITE.erl @@ -98,6 +98,20 @@ init_per_testcase(t_old_bpapi_vsn, Config) -> meck:expect(emqx_bpapi, supported_version, 1, 1), meck:expect(emqx_bpapi, supported_version, 2, 1), init_per_testcase(common, Config); +init_per_testcase(StartStop, Config) when + StartStop == t_start_stop_bridges_cluster; + StartStop == t_start_stop_bridges_node +-> + meck:new(emqx_bridge_resource, [passthrough]), + meck:expect( + emqx_bridge_resource, + stop, + fun + (_, <<"bridge_not_found">>) -> {error, not_found}; + (Type, Name) -> meck:passthrough([Type, Name]) + end + ), + init_per_testcase(common, Config); init_per_testcase(_, Config) -> {ok, _} = emqx_cluster_rpc:start_link(node(), emqx_cluster_rpc, 1000), {Port, Sock, Acceptor} = start_http_server(fun handle_fun_200_ok/2), @@ -109,6 +123,12 @@ end_per_testcase(t_broken_bpapi_vsn, Config) -> end_per_testcase(t_old_bpapi_vsn, Config) -> meck:unload([emqx_bpapi]), end_per_testcase(common, Config); +end_per_testcase(StartStop, Config) when + StartStop == t_start_stop_bridges_cluster; + StartStop == t_start_stop_bridges_node +-> + meck:unload([emqx_bridge_resource]), + end_per_testcase(common, Config); end_per_testcase(_, Config) -> Sock = ?config(sock, Config), Acceptor = ?config(acceptor, Config), @@ -626,6 +646,16 @@ do_start_stop_bridges(Type, Config) -> %% Looks ok but doesn't exist {ok, 404, _} = request(post, operation_path(Type, start, <<"webhook:cptn_hook">>), <<"">>), + %% + {ok, 201, _Bridge} = request( + post, + uri(["bridges"]), + ?HTTP_BRIDGE(URL1, <<"bridge_not_found">>) + ), + {ok, 503, _} = request( + post, operation_path(Type, stop, <<"webhook:bridge_not_found">>), <<"">> + ), + %% Create broken bridge {ListenPort, Sock} = listen_on_random_port(), %% Connecting to this endpoint should always timeout