From f29bbb6b16b8149c5893726b14c52fb4e9683ede Mon Sep 17 00:00:00 2001 From: Mike Hearn Date: Mon, 14 May 2018 16:41:24 +0200 Subject: [PATCH] Docs: Docs: switch from ReCommonMark to m2r, as RCM is not really maintained and is buggy. Tidy up presentation of HA design doc. Misc markup fixes throughout, as revealed by m2r. --- docs/requirements.txt | 5 +- docs/source/conf.py | 20 +- .../design/certificate-hierarchies/design.md | 4 - docs/source/design/designTemplate/design.md | 5 +- .../design.md | 5 - .../design/float/decisions/e2e-encryption.md | 7 +- .../design/float/decisions/p2p-protocol.md | 46 ++-- .../float/decisions/pluggable-broker.md | 2 - docs/source/design/float/design.md | 9 +- .../design/hadr/decisions/crash-shell.md | 16 +- .../design/hadr/decisions/db-msg-store.md | 17 +- .../hadr/decisions/drb-meeting-20171116.md | 12 +- .../design/hadr/decisions/external-broker.md | 27 +- .../design/hadr/decisions/ip-addressing.md | 16 +- .../hadr/decisions/medium-term-target.md | 20 +- .../design/hadr/decisions/near-term-target.md | 18 +- docs/source/design/hadr/design.md | 234 +++++++++++------- ...deployment - Hot-Cold.png => hot-cold.png} | Bin ...A deployment - Hot-Hot.png => hot-hot.png} | Bin ...deployment - Hot-Warm.png => hot-warm.png} | Bin .../{HA deployment - No HA.png => no-ha.png} | Bin docs/source/index.rst | 1 + 22 files changed, 244 insertions(+), 220 deletions(-) rename docs/source/design/hadr/{HA deployment - Hot-Cold.png => hot-cold.png} (100%) rename docs/source/design/hadr/{HA deployment - Hot-Hot.png => hot-hot.png} (100%) rename docs/source/design/hadr/{HA deployment - Hot-Warm.png => hot-warm.png} (100%) rename docs/source/design/hadr/{HA deployment - No HA.png => no-ha.png} (100%) diff --git a/docs/requirements.txt b/docs/requirements.txt index 91cd59da01..bf8d5e0d98 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -8,17 +8,18 @@ future==0.16.0 idna==2.6 imagesize==0.7.1 Jinja2==2.8 +m2r==0.1.14 MarkupSafe==0.23 +mistune==0.8.3 packaging==17.1 pdfrw==0.4 Pillow==5.1.0 Pygments==2.2.0 pyparsing==2.2.0 pytz==2016.4 -recommonmark==0.4.0 reportlab==3.4.0 requests==2.18.4 -rst2pdf==0.93.dev0 +rst2pdf==0.93 six==1.10.0 snowballstemmer==1.2.1 Sphinx==1.7.4 diff --git a/docs/source/conf.py b/docs/source/conf.py index b22f508600..5202714c97 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -13,7 +13,6 @@ # serve to show the default. import sphinx_rtd_theme -from recommonmark.transform import AutoStructify # If extensions (or modules to document with autodoc) are in another directory, @@ -26,10 +25,8 @@ from recommonmark.transform import AutoStructify # If your documentation needs a minimal Sphinx version, state it here. #needs_sphinx = '1.0' -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = ['rst2pdf.pdfbuilder'] +# m2r is a Markdown to RST converter, as our design docs use Markdown. +extensions = ['rst2pdf.pdfbuilder', 'm2r'] # PDF configuration pdf_documents = [('index', u'corda-developer-site', u'Corda Developer Documentation', u'R3')] @@ -43,9 +40,6 @@ templates_path = ['_templates'] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: source_suffix = ['.rst', '.md'] -source_parsers = { - '.md': 'recommonmark.parser.CommonMarkParser', -} # The encoding of source files. source_encoding = 'utf-8-sig' @@ -262,13 +256,3 @@ latex_documents = [ # If false, no module index is generated. 
#latex_domain_indices = True - -web_doc_root = 'https://docs.corda.net/' -def setup(app): - app.add_config_value('recommonmark_config', { - 'url_resolver': lambda url: web_doc_root + url, - 'auto_toc_tree_section': 'Contents', - 'enable_eval_rst': True, - 'enable_auto_doc_ref': False, - }, True) - app.add_transform(AutoStructify) \ No newline at end of file diff --git a/docs/source/design/certificate-hierarchies/design.md b/docs/source/design/certificate-hierarchies/design.md index b4a99e395e..dfc609ffb1 100644 --- a/docs/source/design/certificate-hierarchies/design.md +++ b/docs/source/design/certificate-hierarchies/design.md @@ -1,8 +1,6 @@ # Certificate hierarchies -```eval_rst .. important:: This design doc applies to the main Corda network. Other networks may use different certificate hierarchies. -``` ## Overview @@ -53,13 +51,11 @@ context. ## Design Decisions -```eval_rst .. toctree:: :maxdepth: 2 decisions/levels.md decisions/tls-trust-root.md -``` ## **Target** Solution diff --git a/docs/source/design/designTemplate/design.md b/docs/source/design/designTemplate/design.md index 601e2f7074..8edba01a58 100644 --- a/docs/source/design/designTemplate/design.md +++ b/docs/source/design/designTemplate/design.md @@ -44,13 +44,12 @@ Design documents should follow the standard GitHub version management and pull r #### Design Decisions -| Description | Recommendation | Approval* | +| Description | Recommendation | Approval | | ---------------------------------------- | --------------- | ----------------------- | | [Design Decision 1](decisions/decision.md) | Selected option | (Design Approval Board) | | [Design Decision 2](decisions/decision.md) | Selected option | (Design Approval Board) | | [Design Decision 3](decisions/decision.md) | Selected option | (Design Approval Board) | -\* only required for formal Design Approval Board meetings. ## Document History @@ -116,7 +115,7 @@ List of design decisions identified in defining the target solution: | [Design Decision 3](decisions/decision.md) | Option B | It is reasonable to expect decisions to be challenged prior to any formal review and approval. -*In certain scenarios the Design Decision itself may solicit a recommendation from reviewers. +In certain scenarios the Design Decision itself may solicit a recommendation from reviewers. ## Target Solution diff --git a/docs/source/design/failure-detection-master-election/design.md b/docs/source/design/failure-detection-master-election/design.md index c9f2da8fae..1d9a85a1ba 100644 --- a/docs/source/design/failure-detection-master-election/design.md +++ b/docs/source/design/failure-detection-master-election/design.md @@ -1,8 +1,6 @@ # Failure detection and master election -```eval_rst .. important:: This design document describes a feature of Corda Enterprise. -``` ## Background @@ -38,14 +36,11 @@ It would also be helpful for the chosen solution to not add deployment complexit ## Design decisions -```eval_rst .. 
toctree::
    :maxdepth: 2
 
    drb-meeting-20180131.md
-```
-
 
 ## Proposed solutions
 
 Based on what is needed for Hot-Warm, 1 active node and at least one passive node (started but in stand-by mode), and
diff --git a/docs/source/design/float/decisions/e2e-encryption.md b/docs/source/design/float/decisions/e2e-encryption.md
index 09c217630d..0dd09bc052 100644
--- a/docs/source/design/float/decisions/e2e-encryption.md
+++ b/docs/source/design/float/decisions/e2e-encryption.md
@@ -47,4 +47,9 @@ Proceed with Option 2: Placeholder
 
 ## Decision taken
 
-[DNB Meeting, 16/11/2017](./drb-meeting-20171116.md): Proceed with Option 2 - Add placeholder, subject to more detailed design proposal (RGB, JC, MH agreed)
+Proceed with Option 2 - Add placeholder, subject to more detailed design proposal (RGB, JC, MH agreed)
+
+.. toctree::
+
+   drb-meeting-20171116.md
+
diff --git a/docs/source/design/float/decisions/p2p-protocol.md b/docs/source/design/float/decisions/p2p-protocol.md
index edc5f35f18..bf028e8b31 100644
--- a/docs/source/design/float/decisions/p2p-protocol.md
+++ b/docs/source/design/float/decisions/p2p-protocol.md
@@ -14,32 +14,46 @@ Under this option, P2P messaging will follow the [Advanced Message Queuing Proto
 
 #### Advantages
 
-1. As we have described in our marketing materials.
-2. Well-defined standard.
-3. Supportfor packet level flow control and explicit delivery acknowledgement.
-4. Will allow eventual swap out of Artemis for other brokers.
+1. As we have described in our marketing materials.
+2. Well-defined standard.
+3. Support for packet level flow control and explicit delivery acknowledgement.
+4. Will allow eventual swap out of Artemis for other brokers.
 
 #### Disadvantages
 
-1. AMQP is a complex protocol with many layered state machines, for which it may prove hard to verify security properties.
-2. No support for secure MAC in packets frames.
-3. No defined encryption mode beyond creating custom payload encryption and custom headers.
-4. No standardised support for queue creation/enumeration, or deletion.
-5. Use of broker durable queues and autonomousbridge transfers does not align with checkpoint timing, so that independent replication of the DB and Artemis data risks causing problems. (Writing to the DB doesn’t work currently and is probably also slow).
+1. AMQP is a complex protocol with many layered state machines, for which it may prove hard to verify security properties.
+2. No support for secure MAC in packet frames.
+3. No defined encryption mode beyond creating custom payload encryption and custom headers.
+4. No standardised support for queue creation/enumeration, or deletion.
+5. Use of broker durable queues and autonomous bridge transfers does not align with checkpoint timing, so that independent replication of the DB and Artemis data risks causing problems. (Writing to the DB doesn’t work currently and is probably also slow).
 
 ### 2. Develop a custom protocol
 
-This option would discard existing Artemis server/AMQP support for peer-to-peer communications in favour of a custom implementation of the Corda MessagingService, which takes direct responsibility for message retries and stores the pending messages into the node's database. The wire level of this service would be built on top of a fully encrypted MIX network which would not require a fully connected graph, but rather send messages on randomly selected paths over the dynamically managed network graph topology. 
+This option would discard existing Artemis server/AMQP support for peer-to-peer communications in favour of a custom +implementation of the Corda MessagingService, which takes direct responsibility for message retries and stores the +pending messages into the node's database. The wire level of this service would be built on top of a fully encrypted MIX +network which would not require a fully connected graph, but rather send messages on randomly selected paths over the +dynamically managed network graph topology. -Packet format would likely use the ![SPHINX packet format](http://www0.cs.ucl.ac.uk/staff/G.Danezis/papers/sphinx-eprint.pdf) although with the body encryption updated to a modern AEAD scheme as in https://www.cs.ru.nl/~bmennink/pubs/16cans.pdf . In this scheme, nodes would be identified in the overlay network solely by Curve25519 public key addresses and floats would be dumb nodes that only run the MIX network code and don't act as message sources, or sinks. Intermediate traffic would not be readable except by the intended waypoint and only the final node can read the payload. +Packet format would likely use the [SPHINX packet format](http://www0.cs.ucl.ac.uk/staff/G.Danezis/papers/sphinx-eprint.pdf) although with the body encryption updated to +a modern AEAD scheme as in https://www.cs.ru.nl/~bmennink/pubs/16cans.pdf . In this scheme, nodes would be identified in +the overlay network solely by Curve25519 public key addresses and floats would be dumb nodes that only run the MIX +network code and don't act as message sources, or sinks. Intermediate traffic would not be readable except by the +intended waypoint and only the final node can read the payload. -Point to point links would be standard TLS and the network certificates would be whatever is acceptable to the host institutions e.g. standard Verisign certs. It is assumed institutions would select partners to connect to that they trust and permission them individually in their firewalls. Inside the MIX network the nodes would be connected mostly in a static way and use standard HELLO packets to determine the liveness of neighbour routes, then use tunnelled gossip to distribute the signed/versioned Link topology messages. Nodes will also be allowed to advertise a public IP, so some dynamic links and publicly visible nodes would exist. Network map addresses would then be mappings from Legal Identity to these overlay network addresses, not to physical network locations. +Point to point links would be standard TLS and the network certificates would be whatever is acceptable to the host +institutions e.g. standard Verisign certs. It is assumed institutions would select partners to connect to that they +trust and permission them individually in their firewalls. Inside the MIX network the nodes would be connected mostly in +a static way and use standard HELLO packets to determine the liveness of neighbour routes, then use tunnelled gossip to +distribute the signed/versioned Link topology messages. Nodes will also be allowed to advertise a public IP, so some +dynamic links and publicly visible nodes would exist. Network map addresses would then be mappings from Legal Identity +to these overlay network addresses, not to physical network locations. #### Advantages 1. Can be defined with very small message surface area that is amenable to security analysis. 2. Packet formats can follow best practice cryptography from the start and be matched to Corda’s needs. -3. 
Doesn’t require ‘Complete Graph’ structure for network if we have intermediate routing. +3. Doesn’t require a complete graph structure for network if we have intermediate routing. 4. More closely aligns checkpointing and message delivery handling at the application level. #### Disadvantages @@ -54,4 +68,8 @@ Proceed with Option 1 ## Decision taken -[DNB Meeting, 16/11/2017](./drb-meeting-20171116.md): Proceed with Option 1 - Continue to use AMQP (RGB, JC, MH agreed) +Proceed with Option 1 - Continue to use AMQP (RGB, JC, MH agreed) + +.. toctree:: + + drb-meeting-20171116.md diff --git a/docs/source/design/float/decisions/pluggable-broker.md b/docs/source/design/float/decisions/pluggable-broker.md index d4f0a8edcd..3096360337 100644 --- a/docs/source/design/float/decisions/pluggable-broker.md +++ b/docs/source/design/float/decisions/pluggable-broker.md @@ -55,10 +55,8 @@ Proceed with Option 2 (defer development of pluggable brokers until later) ## Decision taken -```eval_rst .. toctree:: drb-meeting-20171116.md -``` Proceed with Option 2 - Defer support for pluggable brokers until later, except in the event that a requirement to do so emerges from higher priority float / HA work. (RGB, JC, MH agreed) diff --git a/docs/source/design/float/design.md b/docs/source/design/float/design.md index 2be1d52b1e..5323526107 100644 --- a/docs/source/design/float/design.md +++ b/docs/source/design/float/design.md @@ -1,8 +1,6 @@ # Float Design -```eval_rst .. important:: This design document describes a feature of Corda Enterprise. -``` ## Overview @@ -75,7 +73,6 @@ Allow connectivity in compliance with DMZ constraints commonly imposed by modern The following design decisions fed into this design: -```eval_rst .. toctree:: :maxdepth: 2 @@ -83,8 +80,6 @@ The following design decisions fed into this design: decisions/ssl-termination.md decisions/e2e-encryption.md decisions/pluggable-broker.md - -``` ## Target Solution @@ -112,10 +107,10 @@ node, is supported. **No state will be serialized on the float**, although suitably protected logs will be recorded of all float activities. **End-to-end encryption** of the payload is not delivered through this design (see Design Decisions, above). For current -*purposes, a header field indicating plaintext/encrypted payload is employed as a placeholder. +purposes, a header field indicating plaintext/encrypted payload is employed as a placeholder. **HA** is enabled (this should be easy as the bridge manager can choose which float to make active). Only fully -*connected DMZ floats should activate their listening port. +connected DMZ floats should activate their listening port. Implementation of the float is expected to be based on existing AMQP Bridge Manager code - see Implementation Plan, below, for expected work stages. diff --git a/docs/source/design/hadr/decisions/crash-shell.md b/docs/source/design/hadr/decisions/crash-shell.md index 8c19731292..1a1d862801 100644 --- a/docs/source/design/hadr/decisions/crash-shell.md +++ b/docs/source/design/hadr/decisions/crash-shell.md @@ -1,14 +1,8 @@ -![Corda](https://www.corda.net/wp-content/uploads/2016/11/fg005_corda_b.png) - --------------------------------------------- -Design Decision: Node starting & stopping -============================================ +# Design Decision: Node starting & stopping ## Background / Context -The potential use of a crash shell is relevant to [high availability](../design.md) capabilities of nodes. 
- - +The potential use of a crash shell is relevant to high availability capabilities of nodes. ## Options Analysis @@ -49,4 +43,8 @@ Proceed with Option 2: Delegate to external tools ## Decision taken -**[DRB meeting, 16/11/2017:](./drb-meeting-20171116.md)** Restarts should be handled by polite shutdown, followed by a hard clear. (RGB, JC, MH agreed) \ No newline at end of file +Restarts should be handled by polite shutdown, followed by a hard clear. (RGB, JC, MH agreed) + +.. toctree:: + + drb-meeting-20171116.md diff --git a/docs/source/design/hadr/decisions/db-msg-store.md b/docs/source/design/hadr/decisions/db-msg-store.md index 6af5c3f684..a2ff7e5680 100644 --- a/docs/source/design/hadr/decisions/db-msg-store.md +++ b/docs/source/design/hadr/decisions/db-msg-store.md @@ -1,14 +1,9 @@ -![Corda](https://www.corda.net/wp-content/uploads/2016/11/fg005_corda_b.png) - --------------------------------------------- -Design Decision: Message storage -============================================ +# Design Decision: Message storage ## Background / Context -Storage of messages by the message broker has implications for replication technologies which can be used to ensure both [high availability](../design.md) and disaster recovery of Corda nodes. - - +Storage of messages by the message broker has implications for replication technologies which can be used to ensure both +[high availability](../design.md) and disaster recovery of Corda nodes. ## Options Analysis @@ -44,4 +39,8 @@ Continue with Option 1: Storage in the file system ## Decision taken -[DRB meeting, 16/11/2017:](./drb-meeting-20171116.md) Use storage in the file system (for now) \ No newline at end of file +Use storage in the file system (for now) + +.. toctree:: + + drb-meeting-20171116.md diff --git a/docs/source/design/hadr/decisions/drb-meeting-20171116.md b/docs/source/design/hadr/decisions/drb-meeting-20171116.md index 155a7f9e51..d19ee14f71 100644 --- a/docs/source/design/hadr/decisions/drb-meeting-20171116.md +++ b/docs/source/design/hadr/decisions/drb-meeting-20171116.md @@ -1,13 +1,7 @@ -![Corda](https://www.corda.net/wp-content/uploads/2016/11/fg005_corda_b.png) - --------------------------------------------- -Design Review Board Meeting Minutes -============================================ +# Design Review Board Meeting Minutes **Date / Time:** 16/11/2017, 16:30 - - ## Attendees - Mark Oldfield (MO) @@ -24,9 +18,7 @@ Design Review Board Meeting Minutes - Jonathan Sartin (JS) - David Lee (DL) - - -## **Minutes** +## Minutes The meeting re-opened following prior discussion of the float design. diff --git a/docs/source/design/hadr/decisions/external-broker.md b/docs/source/design/hadr/decisions/external-broker.md index f44c5ddd7f..e5c5720b01 100644 --- a/docs/source/design/hadr/decisions/external-broker.md +++ b/docs/source/design/hadr/decisions/external-broker.md @@ -1,14 +1,9 @@ -![Corda](https://www.corda.net/wp-content/uploads/2016/11/fg005_corda_b.png) - --------------------------------------------- -Design Decision: Broker separation -============================================ +# Design Decision: Broker separation ## Background / Context -A decision of whether to extract the Artemis message broker as a separate component has implications for the design of [high availability](../design.md) for nodes. - - +A decision of whether to extract the Artemis message broker as a separate component has implications for the design of +[high availability](../design.md) for nodes. 
## Options Analysis @@ -16,15 +11,15 @@ A decision of whether to extract the Artemis message broker as a separate compon #### Advantages -1. Least change +1. Least change #### Disadvantages -1. Means that starting/stopping Corda is tightly coupled to starting/stopping Artemis instances. -2. Risks resource leaks from one system component affecting other components. -3. Not pluggable if we wish to have an alternative broker. +1. Means that starting/stopping Corda is tightly coupled to starting/stopping Artemis instances. +2. Risks resource leaks from one system component affecting other components. +3. Not pluggable if we wish to have an alternative broker. -## 2. External broker +### 2. External broker #### Advantages @@ -46,4 +41,8 @@ Proceed with Option 2: External broker ## Decision taken -**[DRB meeting, 16/11/2017:](./drb-meeting-20171116.md)** The broker should only be separated if required by other features (e.g. the float), otherwise not. (RGB, JC, MH agreed). \ No newline at end of file +The broker should only be separated if required by other features (e.g. the float), otherwise not. (RGB, JC, MH agreed). + +.. toctree:: + + drb-meeting-20171116.md diff --git a/docs/source/design/hadr/decisions/ip-addressing.md b/docs/source/design/hadr/decisions/ip-addressing.md index e52dceabd3..07ad2d6350 100644 --- a/docs/source/design/hadr/decisions/ip-addressing.md +++ b/docs/source/design/hadr/decisions/ip-addressing.md @@ -1,14 +1,8 @@ -![Corda](https://www.corda.net/wp-content/uploads/2016/11/fg005_corda_b.png) - --------------------------------------------- -Design Decision: IP addressing mechanism (near-term) -============================================ +# Design Decision: IP addressing mechanism (near-term) ## Background / Context -End-to-end encryption is a desirable potential design feature for the [float](../design.md). - - +End-to-end encryption is a desirable potential design feature for the [high availability support](design). ## Options Analysis @@ -45,4 +39,8 @@ Proceed with Option 1: Via Load Balancer ## Decision taken -**[DRB meeting, 16/11/2017:](./drb-meeting-20171116.md)** The design can allow for optional load balancers to be implemented by clients. (RGB, JC, MH agreed) \ No newline at end of file +The design can allow for optional load balancers to be implemented by clients. (RGB, JC, MH agreed) + +.. toctree:: + + drb-meeting-20171116.md diff --git a/docs/source/design/hadr/decisions/medium-term-target.md b/docs/source/design/hadr/decisions/medium-term-target.md index 184d6683f9..8f7b779a95 100644 --- a/docs/source/design/hadr/decisions/medium-term-target.md +++ b/docs/source/design/hadr/decisions/medium-term-target.md @@ -1,18 +1,14 @@ -![Corda](https://www.corda.net/wp-content/uploads/2016/11/fg005_corda_b.png) - ------- - # Design Decision: Medium-term target for node HA ## Background / Context -Designing for high availability is a complex task which can only be delivered over an operationally-significant timeline. It is therefore important to determine whether an intermediate state design (deliverable for around March 2018) is desirable as a precursor to longer term outcomes. - - +Designing for high availability is a complex task which can only be delivered over an operationally-significant +timeline. It is therefore important to determine whether an intermediate state design (deliverable for around March +2018) is desirable as a precursor to longer term outcomes. ## Options Analysis -### 1. Hot-warm as interim state (see [HA design doc](../design.md)) +### 1. 
Hot-warm as interim state #### Advantages @@ -26,7 +22,7 @@ Designing for high availability is a complex task which can only be delivered ov 2. May actually turn out more risky than hot-hot, because shutting down code is always prone to deadlocks and resource leakages. 3. Some work would have to be thrown away when we create a full hot-hot solution. -### 2. Progress immediately to Hot-hot (see [HA design doc](../design.md)) +### 2. Progress immediately to Hot-hot #### Advantages @@ -45,5 +41,9 @@ Proceed with Option 1: Hot-warm as interim state. ## Decision taken -**[DRB meeting, 16/11/2017:](./drb-meeting-20171116.md)** Adopt option 1: Medium-term target: Hot Warm (RGB, JC, MH agreed) +Adopt option 1: Medium-term target: Hot Warm (RGB, JC, MH agreed) + +.. toctree:: + + drb-meeting-20171116.md diff --git a/docs/source/design/hadr/decisions/near-term-target.md b/docs/source/design/hadr/decisions/near-term-target.md index 92b7fd1147..6461e18f27 100644 --- a/docs/source/design/hadr/decisions/near-term-target.md +++ b/docs/source/design/hadr/decisions/near-term-target.md @@ -1,14 +1,10 @@ -![Corda](https://www.corda.net/wp-content/uploads/2016/11/fg005_corda_b.png) - --------------------------------------------- -Design Decision: Near-term target for node HA -============================================ +# Design Decision: Near-term target for node HA ## Background / Context -Designing for high availability is a complex task which can only be delivered over an operationally-significant timeline. It is therefore important to determine the target state in the near term as a precursor to longer term outcomes. - - +Designing for high availability is a complex task which can only be delivered over an operationally-significant +timeline. It is therefore important to determine the target state in the near term as a precursor to longer term +outcomes. ## Options Analysis @@ -43,4 +39,8 @@ Proceed with Option 2: Hot-cold. ## Decision taken -**[DRB meeting, 16/11/2017:](./drb-meeting-20171116.md)** Adopt option 2: Near-term target: Hot Cold (RGB, JC, MH agreed) +Adopt option 2: Near-term target: Hot Cold (RGB, JC, MH agreed) + +.. toctree:: + + drb-meeting-20171116.md diff --git a/docs/source/design/hadr/design.md b/docs/source/design/hadr/design.md index 32d03a7f0c..c6032a0a63 100644 --- a/docs/source/design/hadr/design.md +++ b/docs/source/design/hadr/design.md @@ -1,43 +1,28 @@ -![Corda](https://www.corda.net/wp-content/uploads/2016/11/fg005_corda_b.png) - -# High Availability Support for Corda: A Phased Approach - -------------------- -DOCUMENT MANAGEMENT -=================== - -## Document Control - -* High Availability and Disaster Recovery for Corda: A Phased Approach -* Date: 13th November 2017 -* Author: Matthew Nesbit -* Distribution: Design Review Board, Product Management, Services - Technical (Consulting), Platform Delivery -* Corda target version: Enterprise - -## Document Sign-off - -* Author: David Lee -* Reviewers(s): TBD -* Final approver(s): TBD - -## Document History - --------------------------------------------- -HIGH LEVEL DESIGN -============================================ +# High availability support ## Overview ### Background -The term high availability (HA) is used in this document to refer to the ability to rapidly handle any single component failure, whether due to physical issues (e.g. hard drive failure), network connectivity loss, or software faults. 
+The term high availability (HA) is used in this document to refer to the ability to rapidly handle any single component
+failure, whether due to physical issues (e.g. hard drive failure), network connectivity loss, or software faults.
 
-Expectations of HA in modern enterprise systems are for systems to recover normal operation in a few minutes at most, while ensuring minimal/zero data loss. Whilst overall reliability is the overriding objective, it is desirable for Corda to offer HA mechanisms which are both highly automated and transparent to node operators. HA mechanism must not involve any configuration changes that require more than an appropriate admin tool, or a simple start/stop of a process as that would need an Emergency Change Request.
+Expectations of HA in modern enterprise systems are for systems to recover normal operation in a few minutes at most,
+while ensuring minimal/zero data loss. Whilst overall reliability is the overriding objective, it is desirable for Corda
+to offer HA mechanisms which are both highly automated and transparent to node operators. HA mechanisms must not involve
+any configuration changes that require more than an appropriate admin tool, or a simple start/stop of a process, as that
+would need an Emergency Change Request.
 
-HA naturally grades into requirements for Disaster Recovery (DR), which requires that there is a tested procedure to handle large scale multi-component failures e.g. due to data centre flooding, acts of terrorism. DR processes are permitted to involve significant manual intervention, although the complications of actually invoking a Business Continuity Plan (BCP) mean that the less manual intervention, the more competitive Corda will be in the modern vendor market.
-For modern financial institutions, maintaining comprehensive and effective BCP procedures are a legal requirement which is generally tested at least once a year.
+HA naturally grades into requirements for Disaster Recovery (DR), which requires that there is a tested procedure to
+handle large scale multi-component failures e.g. due to data centre flooding, acts of terrorism. DR processes are
+permitted to involve significant manual intervention, although the complications of actually invoking a Business
+Continuity Plan (BCP) mean that the less manual intervention, the more competitive Corda will be in the modern vendor
+market. For modern financial institutions, maintaining comprehensive and effective BCP procedures is a legal
+requirement which is generally tested at least once a year.
 
-However, until Corda is the system of record, or the primary system for transactions we are unlikely to be required to have any kind of fully automatic DR. In fact, we are likely to be restarted only once BCP has restored the most critical systems.
-In contrast, typical financial institutions maintain large, complex technology landscapes in which individual component failures can occur, such as:
+However, until Corda is the system of record, or the primary system for transactions, we are unlikely to be required to
+have any kind of fully automatic DR. In fact, we are likely to be restarted only once BCP has restored the most critical
+systems. 
In contrast, typical financial institutions maintain large, complex technology landscapes in which individual +component failures can occur, such as: * Small scale software failures * Mandatory data centre power cycles @@ -50,10 +35,11 @@ Thus, HA is essential for enterprise Corda and providing help to administrators ### Current node topology -![Current (single process)](./HA%20deployment%20-%20No%20HA.png) +![Current (single process)](./no-ha.png) -The current solution has a single integrated process running in one JVM including -Artemis, H2 database, Flow State Machine, P2P bridging. All storage is on the local file system. There is no HA capability other than manual restart of the node following failure. +The current solution has a single integrated process running in one JVM including Artemis, H2 database, Flow State +Machine, P2P bridging. All storage is on the local file system. There is no HA capability other than manual restart of +the node following failure. #### Limitations @@ -70,60 +56,81 @@ Artemis, H2 database, Flow State Machine, P2P bridging. All storage is on the lo ## Requirements ### Goals - * A logical Corda node should continue to function in the event of an individual component failure or (e.g.) restart. - * No loss, corruption or duplication of data on the ledger due to component outages - * Ensure continuity of flows throughout any disruption - * Support software upgrades in a live network -### Goals (out of scope for this design document) - * Be able to distribute a node over more than two datacenters. - * Be able to distribute a node between datacenters that are very far apart latency-wise (unless you don't care about performance). - * Be able to tolerate arbitrary byzantine failures within a node cluster. - * DR, specifically in the case of the complete failure of a site/datacentre/cluster or region will require a different solution to that specified here. For now DR is only supported where performant synchronous replication is feasible i.e. sites only a few miles apart. +* A logical Corda node should continue to function in the event of an individual component failure or (e.g.) restart. +* No loss, corruption or duplication of data on the ledger due to component outages +* Ensure continuity of flows throughout any disruption +* Support software upgrades in a live network + +### Non-goals (out of scope for this design document) + +* Be able to distribute a node over more than two datacenters. +* Be able to distribute a node between datacenters that are very far apart latency-wise (unless you don't care about performance). +* Be able to tolerate arbitrary byzantine failures within a node cluster. +* DR, specifically in the case of the complete failure of a site/datacentre/cluster or region will require a different + solution to that specified here. For now DR is only supported where performant synchronous replication is feasible + i.e. sites only a few miles apart. ## Timeline -This design document outlines a range of topologies which will be enabled through progressive enhancements from the short to long term. +This design document outlines a range of topologies which will be enabled through progressive enhancements from the +short to long term. -On the timescales available for the current production pilot deployments we clearly do not have time to reach the ideal of a highly fault tolerant, horizontally scaled Corda. 
+On the timescales available for the current production pilot deployments we clearly do not have time to reach the ideal +of a highly fault tolerant, horizontally scaled Corda. -Instead, I suggest that we can only achieve the simplest state of a standby Corda installation only by January 5th and even this is contingent on other enterprise features, such as external database and network map stabilisation being completed on this timescale, plus any issues raised by testing. +Instead, I suggest that we can only achieve the simplest state of a standby Corda installation only by January 5th and +even this is contingent on other enterprise features, such as external database and network map stabilisation being +completed on this timescale, plus any issues raised by testing. -For the March 31st timeline, I hope that we can achieve a more fully automatic node failover state, with the Artemis broker running as a cluster too. I include a diagram of a fully scaled Corda for completeness and so that I can discuss what work is re-usable/throw away. +For the Enterprise GA timeline, I hope that we can achieve a more fully automatic node failover state, with the Artemis +broker running as a cluster too. I include a diagram of a fully scaled Corda for completeness and so that I can discuss +what work is re-usable/throw away. -With regards to DR it is unclear how this would work where synchronous replication is not feasible. At this point we can only investigate approaches as an aside to the main thrust of work for HA support. In the synchronous replication mode it is assumed that the file and database replication can be used to ensure a cold DR backup. +With regards to DR it is unclear how this would work where synchronous replication is not feasible. At this point we can +only investigate approaches as an aside to the main thrust of work for HA support. In the synchronous replication mode +it is assumed that the file and database replication can be used to ensure a cold DR backup. ## Design Decisions The following design decisions are assumed by this design: -1. [Near-term-target](./decisions/near-term-target.md): Hot-Cold HA (see below) -2. [Medium-term target](./decisions/medium-term-target.md): Hot-Warm HA (see below) -3. [External broker](./decisions/external-broker.md): Yes -4. [Database message store](./decisions/db-msg-store.md): No -5. [IP addressing mechanism](./decisions/ip-addressing.md): Load balancer -6. [Crash shell start/stop](./decisions/crash-shell.md): No - - +.. toctree:: + :maxdepth: 1 + + decisions/near-term-target.md + decisions/medium-term-target.md + decisions/external-broker.md + decisions/db-msg-store.md + decisions/ip-addressing.md + decisions/crash-shell.md ## Target Solution - ### Hot-Cold (minimum requirement) -![Hot-Cold (minimum requirement)](./HA%20deployment%20-%20Hot-Cold.png) +![Hot-Cold (minimum requirement)](./hot-cold.png) -Small scale software failures on a node are recovered from locally via restarting/re-setting the offending component by the external (to JVM) "Health Watchdog" (HW) process. The HW process (eg a shell script or similar) would monitor parameters for java processes by periodically query them (sleep period a few seconds). This may require introduction of a few monitoring 'hooks' into Corda codebase or a "health" CorDapp the HW script can interface with. There would be a back-off logic to prevent continues restarts in the case of persistent failure. 
+Small scale software failures on a node are recovered locally via restarting/re-setting the offending component by
+the external (to JVM) "Health Watchdog" (HW) process. The HW process (e.g. a shell script or similar) would monitor
+parameters for Java processes by periodically querying them (sleep period a few seconds). This may require introduction
+of a few monitoring 'hooks' into the Corda codebase or a "health" CorDapp the HW script can interface with. There would
+be back-off logic to prevent continuous restarts in the case of persistent failure.
 
 We would provide a fully-functional sample HW script for Linux/Unix deployment platforms.
 
-The hot-cold design provides a backup VM and Corda deployment instance that can be manually started if the primary is stopped. The failed primary must be killed to ensure it is fully stopped.
+The hot-cold design provides a backup VM and Corda deployment instance that can be manually started if the primary is
+stopped. The failed primary must be killed to ensure it is fully stopped.
 
-For single-node deployment scenarios the simplest supported way to recover from failures is to re-start the entire set of Corda Node processes or reboot the node OS.
+For single-node deployment scenarios the simplest supported way to recover from failures is to re-start the entire set
+of Corda Node processes or reboot the node OS.
 
-For a 2-node HA deployment scenario a load balancer determines which node is active and routes traffic to that node.
-The load balancer will need to monitor the health of the primary and secondary nodes and automatically route traffic from the public IP address to the only active end-point. An external solution is required for the load balancer and health monitor. In the case of Azure cloud deployments, no custom code needs to be developed to support the health monitor.
+For a 2-node HA deployment scenario, a load balancer determines which node is active and routes traffic to that node. The
+load balancer will need to monitor the health of the primary and secondary nodes and automatically route traffic from
+the public IP address to the only active end-point. An external solution is required for the load balancer and health
+monitor. In the case of Azure cloud deployments, no custom code needs to be developed to support the health monitor.
 
-An additional component will be written to prevent accidental dual running which is likely to make use of a database heartbeat table. Code size should be minimal.
+An additional component will be written to prevent accidental dual running, which is likely to make use of a database
+heartbeat table. Code size should be minimal.
 
 #### Advantages
 
@@ -146,7 +153,7 @@ An additional component will be written to prevent accidental dual running which
 
 - Health reporting and process controls need to be developed by the customer.
 
 ### Hot-Warm (Medium-term solution)
-![Hot-Warm (Medium-term solution)](./HA%20deployment%20-%20Hot-Warm.png)
+![Hot-Warm (Medium-term solution)](./hot-warm.png)
 
 Hot-warm aims to automate failover and provide failover of individual major components e.g. Artemis.
 
 It involves Two key changes to the hot-cold design:
 1) Separation and clustering of the Artemis broker.
 2) Start and stop of flow processing without JVM exit.
-The consequences of these changes are that peer to peer bridging is separated from the node and a bridge control protocol must be developed. 
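+
+As a purely illustrative sketch of the second change, stopping and starting flow processing without exiting the JVM,
+the hot-warm node needs something like the reversible switch below. All names here are assumptions made for
+illustration and are not existing Corda APIs:
+
+```kotlin
+import java.util.concurrent.atomic.AtomicBoolean
+
+// Hypothetical sketch only: a reversible active/passive switch for flow processing,
+// so a node can gain or yield mastership without a JVM restart.
+class FlowProcessingSwitch {
+    private val active = AtomicBoolean(false)
+
+    /** Called when this node wins the leader election: reconnect bridges and resume checkpointed flows. */
+    fun activate() {
+        if (active.compareAndSet(false, true)) {
+            // start the state machine and resume flows from their last checkpoints
+        }
+    }
+
+    /** Called on losing leadership or on broker/database disconnect: park flows at their checkpoints. */
+    fun deactivate() {
+        if (active.compareAndSet(true, false)) {
+            // stop accepting new work, flush in-flight flows to checkpoints, release resources
+        }
+    }
+
+    val isActive: Boolean get() = active.get()
+}
+```
+
+The leader election component described below would drive `activate()` and `deactivate()`, rather than starting and
+stopping the whole process.
+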
-A leader election component is a pre-cursor to load balancing – likely to be a combination of custom code and standard library and, in the short term, is likely to be via the database. -Cleaner handling of disconnects from the external components (Artemis and the database) will also be needed. +The consequences of these changes are that peer to peer bridging is separated from the node and a bridge control +protocol must be developed. A leader election component is a pre-cursor to load balancing – likely to be a combination +of custom code and standard library and, in the short term, is likely to be via the database. Cleaner handling of +disconnects from the external components (Artemis and the database) will also be needed. #### Advantages @@ -175,13 +183,15 @@ Cleaner handling of disconnects from the external components (Artemis and the da - No horizontal scaling support. - Deployment of master and slave may not be completely symmetric. - Care must be taken with upgrades to ensure master/slave election operates across updates. -- Artemis clustering does require a designated master at start-up of its cluster hence any restart involving changing the primary node will require configuration management. +- Artemis clustering does require a designated master at start-up of its cluster hence any restart involving changing + the primary node will require configuration management. - The development effort is much more significant than the hot-cold configuration. ### Hot-Hot (Long-term strategic solution) -![Hot-Hot (Long-term strategic solution)](./HA%20deployment%20-%20Hot-Hot.png) +![Hot-Hot (Long-term strategic solution)](./hot-hot.png) -In this configuration, all nodes are actively processing work and share a clustered database. A mechanism for sharding or distributing the work load will need to be developed. +In this configuration, all nodes are actively processing work and share a clustered database. A mechanism for sharding +or distributing the work load will need to be developed. #### Advantages @@ -197,40 +207,76 @@ In this configuration, all nodes are actively processing work and share a cluste - Will require handling of more states than just checkpoints e.g. soft locks and RPC subscriptions. - Single flows will not be active on multiple nodes without future development work. --------------------------------------------- -IMPLEMENTATION PLAN -============================================ +## Implementation plan -## Transitioning from Corda 2.0 to Manually Activated HA +### Transitioning from Corda 2.0 to Manually Activated HA -The current Corda is built to run as a fully contained single process with the Flow logic, H2 database and Artemis broker all bundled together. This limits the options for automatic replication, or subsystem failure. Thus, we must use external mechanisms to replicate the data in the case of failure. We also should ensure that accidental dual start is not possible in case of mistakes, or slow shutdown of the primary. +The current Corda is built to run as a fully contained single process with the Flow logic, H2 database and Artemis +broker all bundled together. This limits the options for automatic replication, or subsystem failure. Thus, we must use +external mechanisms to replicate the data in the case of failure. We also should ensure that accidental dual start is +not possible in case of mistakes, or slow shutdown of the primary. Based on this situation, I suggest the following minimum development tasks are required for a tested HA deployment: -1. 
Complete and merge JDBC support for an external clustered database. Azure SQL Server has been identified as the most likely initial deployment. With this we should be able to point at an HA database instance for Ledger and Checkpoint data. -2. I am suggesting that for the near term we just use the Azure Load Balancer to hide the multiple machine addresses. This does require allowing a health monitoring link to the Artemis broker, but so far testing indicates that this operates without issue. Longer term we need to ensure that the network map and configuration support exists for the system to work with multiple TCP/IP endpoints advertised to external nodes. Ideally this should be rolled into the work for AMQP bridges and Floats. -3. Implement a very simple mutual exclusion feature, so that an enterprise node cannot start if another is running onto the same database. This can be via a simple heartbeat update in the database, or possibly some other library. This feature should be enabled only when specified by configuration. -4. The replication of the Artemis Message Queues will have to be via an external mechanism. On Azure we believe that the only practical solution is the 'Azure Files' approach which maps a virtual Samba drive. This we are testing in-case it is too slow to work. The mounting of separate Data Disks is possible, but they can only be mounted to one VM at a time, so they would not be compatible with the goal of no change requests for HA. -5. Improve health monitoring to better indicate fault failure. Extending the existing JMX and logging support should achieve this, although we probably need to create watchdog CordApp that verifies that the State Machine and Artemis messaging are able to process new work and to monitor flow latency. -6. Test the checkpointing mechanism and confirm that failures don't corrupt the data by deploying an HA setup on Azure and driving flows through the system as we stop the node randomly and switch to the other node. If this reveals any issues we will have to fix them. -7. Confirm that the behaviour of the RPC Client API is stable through these restarts, from the perspective of a stateless REST server calling through to RPC. The RPC API should provide positive feedback to the application, so that it can respond in a controlled fashion when disconnected. +1. Complete and merge JDBC support for an external clustered database. Azure SQL Server has been identified as the most + likely initial deployment. With this we should be able to point at an HA database instance for Ledger and Checkpoint data. +2. I am suggesting that for the near term we just use the Azure Load Balancer to hide the multiple machine addresses. + This does require allowing a health monitoring link to the Artemis broker, but so far testing indicates that this + operates without issue. Longer term we need to ensure that the network map and configuration support exists for the + system to work with multiple TCP/IP endpoints advertised to external nodes. Ideally this should be rolled into the + work for AMQP bridges and Floats. +3. Implement a very simple mutual exclusion feature, so that an enterprise node cannot start if another is running onto + the same database. This can be via a simple heartbeat update in the database, or possibly some other library. This + feature should be enabled only when specified by configuration. +4. The replication of the Artemis Message Queues will have to be via an external mechanism. 
On Azure we believe that the
+   only practical solution is the 'Azure Files' approach which maps a virtual Samba drive. This we are testing in case it
+   is too slow to work. The mounting of separate Data Disks is possible, but they can only be mounted to one VM at a
+   time, so they would not be compatible with the goal of no change requests for HA.
+5. Improve health monitoring to better indicate fault failure. Extending the existing JMX and logging support should
+   achieve this, although we probably need to create a watchdog CorDapp that verifies that the State Machine and Artemis
+   messaging are able to process new work and to monitor flow latency.
+6. Test the checkpointing mechanism and confirm that failures don't corrupt the data by deploying an HA setup on Azure
+   and driving flows through the system as we stop the node randomly and switch to the other node. If this reveals any
+   issues we will have to fix them.
+7. Confirm that the behaviour of the RPC Client API is stable through these restarts, from the perspective of a stateless
+   REST server calling through to RPC. The RPC API should provide positive feedback to the application, so that it can
+   respond in a controlled fashion when disconnected.
 8. Work on flow hospital tools where needed
 
-## Moving Towards Automatic Failover HA
+### Moving Towards Automatic Failover HA
 
-To move towards more automatic failover handling we need to ensure that the node can be partially active i.e. live monitoring the health status and perhaps keeping major data structures in sync for faster activation, but not actually processing flows. This needs to be reversible without leakage, or destabilising the node as it is common to use manually driven master changes to help with software upgrades and to carry out regular node shutdown and maintenance. Also, to reduce the risks associated with the uncoupled replication of the Artemis message data and the database I would recommend that we move the Artemis broker out of the node to allow us to create a failover cluster. This is also in line with the goal of creating a AMQP bridges and Floats.
+To move towards more automatic failover handling we need to ensure that the node can be partially active, i.e. live
+monitoring the health status and perhaps keeping major data structures in sync for faster activation, but not actually
+processing flows. This needs to be reversible without leakage, or destabilising the node, as it is common to use manually
+driven master changes to help with software upgrades and to carry out regular node shutdown and maintenance. Also, to
+reduce the risks associated with the uncoupled replication of the Artemis message data and the database, I would
+recommend that we move the Artemis broker out of the node to allow us to create a failover cluster. This is also in line
+with the goal of creating AMQP bridges and floats.
 
 To this end I would suggest packages of work that include:
 
-1. Move the broker out of the node, which will require having a protocol that can be used to signal bridge creation and which decouples the network map. This is in line with the Flow work anyway.
-2. Create a mastering solution, probably using Atomix.IO although this might require a solution with a minimum of three nodes to avoid split brain issues. Ideally this service should be extensible in the future to lead towards an eventual state with Flow level sharding. Alternatively, we may be able to add a quick enterprise adaptor to ZooKeeper as master selector if time is tight. 
This will inevitably impact upon configuration and deployment support. +1. Move the broker out of the node, which will require having a protocol that can be used to signal bridge creation and + which decouples the network map. This is in line with the Flow work anyway. +2. Create a mastering solution, probably using Atomix.IO although this might require a solution with a minimum of three + nodes to avoid split brain issues. Ideally this service should be extensible in the future to lead towards an eventual + state with Flow level sharding. Alternatively, we may be able to add a quick enterprise adaptor to ZooKeeper as + master selector if time is tight. This will inevitably impact upon configuration and deployment support. 3. Test the leakage when we repeated start-stop the Node class and fix any resource leaks, or deadlocks that occur at shutdown. -4. Switch the Artemis client code to be able to use the HA mode connection type and thus take advantage of the rapid failover code. Also, ensure that we can support multiple public IP addresses reported in the network map. -5. Implement proper detection and handling of disconnect from the external database and/or Artemis broker, which should immediately drop the master status of the node and flush any incomplete flows. -6. We should start looking at how to make RPC proxies recover from disconnect/failover, although this is probably not a top priority. However, it would be good to capture the missed results of completed flows and ensure the API allows clients to unregister/re-register Observables. +4. Switch the Artemis client code to be able to use the HA mode connection type and thus take advantage of the rapid + failover code. Also, ensure that we can support multiple public IP addresses reported in the network map. +5. Implement proper detection and handling of disconnect from the external database and/or Artemis broker, which should + immediately drop the master status of the node and flush any incomplete flows. +6. We should start looking at how to make RPC proxies recover from disconnect/failover, although this is probably not a + top priority. However, it would be good to capture the missed results of completed flows and ensure the API allows + clients to unregister/re-register Observables. ## The Future -Hopefully, most of the work from the automatic failover mode can be modified when we move to a full hot-hot sharding of flows across nodes. The mastering solution will need to be modified to negotiate finer grained claim on individual flows, rather than stopping the whole of Node. Also, the routing of messages will have to be thought about so that they go to the correct node for processing, but failover if the node dies. However, most of the other health monitoring and operational aspects should be reusable. +Hopefully, most of the work from the automatic failover mode can be modified when we move to a full hot-hot sharding of +flows across nodes. The mastering solution will need to be modified to negotiate finer grained claim on individual +flows, rather than stopping the whole of Node. Also, the routing of messages will have to be thought about so that they +go to the correct node for processing, but failover if the node dies. However, most of the other health monitoring and +operational aspects should be reusable. -We also need to look at DR issues and in particular how we might handle asynchronous replication and possibly alternative recovery/reconciliation mechanisms. 
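+
+To make the idea of a finer grained claim concrete, a flow ownership service might look roughly like the sketch below.
+This is an illustration only: the interface name, the lease approach and the suggestion of a database-backed
+implementation are all assumptions rather than agreed APIs.
+
+```kotlin
+import java.util.UUID
+
+// Hypothetical sketch: in a hot-hot cluster each worker must win ownership of an
+// individual flow before processing it, instead of one node owning every flow.
+interface FlowOwnershipService {
+    /** Atomically claim the flow for this worker (e.g. an INSERT into a claims table); false if another worker holds it. */
+    fun tryClaim(flowId: UUID, workerId: String): Boolean
+
+    /** Refresh the claim's lease periodically; an expired lease lets surviving workers take over flows from a dead node. */
+    fun renewLease(flowId: UUID, workerId: String)
+
+    /** Release the claim once the flow checkpoints or completes, so any worker may resume it later. */
+    fun release(flowId: UUID, workerId: String)
+}
+```
+
+Under a scheme of this kind the routing concern above largely reduces to delivering a flow's messages to whichever
+worker currently holds its lease.
+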
+We also need to look at DR issues and in particular how we might handle asynchronous replication and possibly +alternative recovery/reconciliation mechanisms. diff --git a/docs/source/design/hadr/HA deployment - Hot-Cold.png b/docs/source/design/hadr/hot-cold.png similarity index 100% rename from docs/source/design/hadr/HA deployment - Hot-Cold.png rename to docs/source/design/hadr/hot-cold.png diff --git a/docs/source/design/hadr/HA deployment - Hot-Hot.png b/docs/source/design/hadr/hot-hot.png similarity index 100% rename from docs/source/design/hadr/HA deployment - Hot-Hot.png rename to docs/source/design/hadr/hot-hot.png diff --git a/docs/source/design/hadr/HA deployment - Hot-Warm.png b/docs/source/design/hadr/hot-warm.png similarity index 100% rename from docs/source/design/hadr/HA deployment - Hot-Warm.png rename to docs/source/design/hadr/hot-warm.png diff --git a/docs/source/design/hadr/HA deployment - No HA.png b/docs/source/design/hadr/no-ha.png similarity index 100% rename from docs/source/design/hadr/HA deployment - No HA.png rename to docs/source/design/hadr/no-ha.png diff --git a/docs/source/index.rst b/docs/source/index.rst index 4a810feb63..58a87c71b9 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -60,6 +60,7 @@ We look forward to seeing what you can do with Corda! design/certificate-hierarchies/design.md design/failure-detection-master-election/design.md design/float/design.md + design/hadr/design.md .. toctree:: :caption: Participate