diff --git a/.taskmaster/tasks/tasks.json b/.taskmaster/tasks/tasks.json index 1a96f94..1e1f0a8 100644 --- a/.taskmaster/tasks/tasks.json +++ b/.taskmaster/tasks/tasks.json @@ -337,7 +337,7 @@ "details": "Source audit: .taskmaster/docs/audit-backend-stack-security-and-refactor-assessment-2026-05-24.md. This task is advisory/architecture-focused and should run in parallel with immediate hardening. It should produce the decision artifacts needed before any backend-core rewrite or provider migration is started.", "testStrategy": "Review and sign off each architecture document with backend, payments, frontend, and operations stakeholders. Confirm every open question has an owner or explicit deferred decision before implementation work begins.", "priority": "high", - "status": "in-progress", + "status": "done", "dependencies": [], "subtasks": [ { @@ -371,52 +371,56 @@ "title": "Specify funds ledger and escrow state machine", "description": "Define canonical money movement and legal state transitions before refactor or provider migration.", "details": "Create specs for FundsAccount, LedgerEntry, FundsBalance, gross paid, provider fees, platform fees, held, disputed, releasable, released, refunded, idempotency keys, reconciliation behavior, purchase request states, payment states, escrow/funds states, dispute states, valid transitions, forbidden transitions, and release/refund/admin override preconditions.", - "status": "pending", + "status": "done", "priority": "high", "dependencies": [ 2 ], "testStrategy": "Spec can be used to reject double-release, release-during-dispute, underfunded payout, and ambiguous provider-event scenarios.", - "parentId": "undefined" + "parentId": "undefined", + "updatedAt": "2026-05-24T07:23:41.596Z" }, { "id": 4, "title": "Create authorization matrix for REST and Socket.IO", "description": "Map every endpoint and realtime event to access level, ownership checks, state preconditions, rate-limit tier, and audit-log requirement.", "details": 
"Include public/authenticated/owner/buyer/seller/admin/support/service-role classifications. Socket.IO rooms must be server-derived from authenticated identity, not client-supplied user IDs.", - "status": "pending", + "status": "done", "priority": "high", "dependencies": [ 2 ], "testStrategy": "No route or socket event remains unmapped; implementation tasks can reference matrix rows directly.", - "parentId": "undefined" + "parentId": "undefined", + "updatedAt": "2026-05-24T07:23:43.108Z" }, { "id": 5, "title": "Decide session, passkey, and admin step-up architecture", "description": "Choose browser session model and high-risk admin authentication requirements.", "details": "Decide localStorage versus httpOnly cookies, access/refresh token lifetimes, CSRF strategy, refresh rotation, WebAuthn requirements, OAuth requirements, device/session revocation, and whether payouts/role changes require step-up authentication or two-person approval.", - "status": "pending", + "status": "done", "priority": "high", "dependencies": [ 2 ], "testStrategy": "Decision record lists chosen model, rejected alternatives, migration cost, and required implementation tasks.", - "parentId": "undefined" + "parentId": "undefined", + "updatedAt": "2026-05-24T07:23:44.643Z" }, { "id": 6, "title": "Specify webhook security and provider adapter contracts", "description": "Define provider-neutral payment interface and signed webhook processing rules.", "details": "Document createPayInIntent, getPayInStatus, handleProviderWebhook, createHostedPaymentLink, createReleaseInstruction, createRefundInstruction, getPayoutStatus, searchProviderPayments, raw-body signature verification, replay prevention, delivery ID idempotency, duplicate/unknown event behavior, retry semantics, dead-letter/replay storage, and alert thresholds.", - "status": "pending", + "status": "done", "priority": "high", "dependencies": [ 3 ], "testStrategy": "Contracts cover SHKeeper legacy, Request Network, manual/admin wallet, invalid 
signatures, duplicate deliveries, and missed webhook reconciliation.", - "parentId": "undefined" + "parentId": "undefined", + "updatedAt": "2026-05-24T07:21:42.699Z" }, { "id": 7, @@ -437,7 +441,7 @@ "title": "Make backend-core stack decision", "description": "Choose whether the security-critical backend core remains TypeScript or moves to Go/Kotlin/Rust/Python.", "details": "Evaluate team capability, two-year maintainability, operational footprint, rewrite cost, dual-stack complexity, auditability, supply-chain exposure, and which modules belong in a payment/auth/escrow core versus the existing marketplace/chat API.", - "status": "pending", + "status": "done", "priority": "medium", "dependencies": [ 2, @@ -448,23 +452,25 @@ 7 ], "testStrategy": "Architecture decision record states chosen stack, scope of extraction, non-goals, migration phases, rollback criteria, and owners.", - "parentId": "undefined" + "parentId": "undefined", + "updatedAt": "2026-05-24T07:21:45.258Z" }, { "id": 9, "title": "Create migration and operational runbooks", "description": "Document rollout, rollback, and incident response for the selected backend/funds architecture.", "details": "Include SHKeeper legacy read path, provider feature flag, ledger backfill, validation report before enforcement, rollback criteria, webhook cutoff, manual reconciliation, failed webhook, duplicate/missing payment, stuck release, disputed release attempt, compromised admin, leaked API key, provider outage, chain/RPC outage, suspicious payment proof, and npm/package compromise.", - "status": "pending", + "status": "done", "priority": "medium", "dependencies": [ 8 ], "testStrategy": "Runbooks identify owner, trigger, detection signal, immediate action, recovery action, and post-incident documentation for each scenario.", - "parentId": "undefined" + "parentId": "undefined", + "updatedAt": "2026-05-24T07:21:47.810Z" } ], - "updatedAt": "2026-05-24T06:43:04.699Z" + "updatedAt": "2026-05-24T07:23:44.643Z" }, { "id": 
"5", @@ -612,12 +618,12 @@ ], "metadata": { "version": "1.0.0", - "lastModified": "2026-05-24T07:04:01.906Z", + "lastModified": "2026-05-24T07:23:44.643Z", "taskCount": 5, - "completedCount": 2, + "completedCount": 4, "tags": [ "master" ] } } -} +} \ No newline at end of file diff --git a/08 - Operations/Backend Funds Migration and Operational Runbooks.md b/08 - Operations/Backend Funds Migration and Operational Runbooks.md new file mode 100644 index 0000000..d940c48 --- /dev/null +++ b/08 - Operations/Backend Funds Migration and Operational Runbooks.md @@ -0,0 +1,197 @@ +--- +title: Backend Funds Migration and Operational Runbooks +tags: [operations, payments, migration, incidents] +created: 2026-05-24 +--- + +# Backend Funds Migration and Operational Runbooks + +These runbooks cover the selected backend/funds architecture defined in +[[Backend Core Stack Decision Record - 2026-05-24]]. + +## 1. Migration runbook (legacy + provider migration) + +### 1.1 Preflight + +1. Verify provider credentials and webhook secrets are present in production `.env`. +2. Confirm canonical state docs are published and approved: + - [[Funds Ledger and Escrow State Machine Specification]] + - [[Payment Provider Adapter Spec]] + - [[Webhook Security Spec]] +3. Run validation report: + - count of legacy `Payment` rows by provider, + - duplicate provider keys, + - webhook processing success/error by provider (24h), + - unresolved `pending`/`processing` payments older than 30m. +4. Run a staging reconciliation simulation against a read-only copy. + +### 1.2 Rollout sequence + +1. Keep `PAYMENT_MODE=read_only` and keep SHKeeper intent path active. +2. Enable legacy+new provider dual routing via `PAYMENT_ENABLED_PROVIDERS`. +3. Activate adapter contract metrics (`payment-provider-events` and webhook counters) before any write routing. +4. Set `PAYMENT_PROVIDER_MODE=standard` for a 10–15% cohort. +5. Expand to full traffic only after two clean 24h windows. 
+ +### 1.3 Legacy read path and backfill + +- Keep SHKeeper legacy read path enabled for at least one billing cycle after migration starts. +- Backfill task: + - map legacy SHKeeper records into `FundsAccount` references, + - write reconciliation notes into `metadata.providers`, + - leave source payment rows immutable. +- Do not delete legacy routing until backfill report is complete and reviewed. + +### 1.4 Validation before enforcement + +Before switching default provider: + +- No critical webhook verification failures for 24h. +- DLQ volume stable for 2h. +- Provider status mismatch rate `< 0.5%` for 1000 samples. +- Manual reconciliation sample (`>= 10` rows) completed by two operators. + +### 1.5 Rollback criteria + +Rollback immediately if any of: + +- payout/hold/release path returns invariant violation, +- unresolved webhook failure burst above threshold, +- chain reconciliation mismatch above threshold from runbook table. + +Rollback command sequence: + +1. `PAYMENT_ENABLED_PROVIDERS=shkeeper` +2. `PAYMENT_DEFAULT_PROVIDER=shkeeper` +3. `PAYMENT_PROVIDER_MODE=read_only` (pause new routing), +4. keep reconciliation active in dry-run mode, +5. post-incident note in this doc + [[Incident Response]]. + +### 1.6 Webhook cutoff and manual reconciliation + +- Keep auto-accepting historical webhook traffic through cutover. +- Set explicit cutoff only if new provider writes must stop: + 1. pause new pay-in intents on new provider, + 2. continue `handleProviderWebhook` in `read_only` mode, + 3. run manual reconciliation for open windows, + 4. re-enable normal routing after backlog clear or explicit incident decision. + +## 2. Scenario runbooks + +Each incident scenario uses the same structure: +`owner`, `trigger`, `detection`, `immediate`, `recovery`, `post`. + +### 2.1 Failed webhook +- **Owner:** BL + DL +- **Trigger:** `webhook_failures` alert > warning threshold (see [[Webhook Security Spec]]). +- **Detection:** spike in 5xx/400 from callback route, DLQ growth. 
+- **Immediate:** isolate by provider; keep `PAYMENT_PROVIDER_MODE=read_only` for provider if mismatch continues. +- **Recovery:** re-process DLQ entries after fix; verify idempotent behavior; compare reconciliation counters. +- **Post:** update webhook retry tuning and security thresholds. + +### 2.2 Duplicate payment +- **Owner:** BL +- **Trigger:** duplicate webhook deliveries or idempotent suppression above normal baseline. +- **Detection:** `duplicate` counter above expected and no payment state change. +- **Immediate:** confirm normalized status transition map; suppress with `deliveryId`/`idempotency` checks. +- **Recovery:** rerun reconciliation for affected IDs and clear manual holds if ledger still consistent. +- **Post:** validate provider event parsing and storage hashing. + +### 2.3 Missing payment +- **Owner:** BL + Operations +- **Trigger:** customer support report or reconciliation finding. +- **Detection:** provider reference exists, no canonical payment row or no final ledger balance. +- **Immediate:** freeze payouts/release for request ID; run `searchProviderPayments`. +- **Recovery:** create missing read-model entry or corrective ledger adjustment after evidence capture. +- **Post:** add missing-test case for same provider/providerReference combination. + +### 2.4 Stuck release +- **Owner:** BL + Ops +- **Trigger:** `released` pending >30m. +- **Detection:** payout task with no terminal transition for `release`/`refund`. +- **Immediate:** confirm chain confirmation and provider payout status. +- **Recovery:** if confirmed on-chain but not reflected, run manual reconciliation + status repair; else reissue payout instruction. +- **Post:** update alerting and retry windows. + +### 2.5 Disputed release attempt +- **Owner:** Security + BL + Admin +- **Trigger:** release/recover API invoked while dispute hold is active. +- **Detection:** `disputeId` active + attempted release/refund action. 
+- **Immediate:** block action, notify admin team and dispute owner. +- **Recovery:** clear dispute hold only by approved dispute resolution path. +- **Post:** enforce ownership check in orchestration and admin UI guardrail. + +### 2.6 Compromised admin +- **Owner:** Security Lead +- **Trigger:** unexpected high-value payouts or admin credentials alert. +- **Detection:** step-up auth failures, role anomalies, IP anomalies. +- **Immediate:** suspend admin account, rotate sessions, set `PAYMENT_PROVIDER_MODE=read_only`. +- **Recovery:** review all recent ledger entries and release/refund actions; reverse via admin correction only after legal/ownership approval. +- **Post:** enforce 2-person approval for high-risk payout thresholds and step-up audit check. + +### 2.7 Leaked API key / webhook secret +- **Owner:** Security + DevOps +- **Trigger:** secret scanner, log leak, or suspected rotation request. +- **Detection:** unauthorized provider 401s, unknown callbacks, or external exposure confirmation. +- **Immediate:** rotate affected key and disable old key immediately. +- **Recovery:** replay missed webhooks in dry-run mode after trust restored; compare reconciliation totals. +- **Post:** document incident and rotate staging/CI secret references. + +### 2.8 Provider outage +- **Owner:** DevOps + BL +- **Trigger:** repeated provider 5xx/timeouts. +- **Detection:** error budget alert and queue growth. +- **Immediate:** switch `PAYMENT_ENABLED_PROVIDERS=shkeeper` or `read_only`, notify users of degraded mode. +- **Recovery:** drain queue and replay DLQ when provider recovers. +- **Post:** update provider outage SLA runbook and status page text. + +### 2.9 Chain/RPC outage +- **Owner:** BL + DevOps +- **Trigger:** chain verification failures and elevated wallet lookup latency. +- **Detection:** verification timeout/error and `chain_verification_stale` trend. +- **Immediate:** pause non-critical chain-dependent actions; preserve pending proofs. 
+- **Recovery:** rerun verification jobs against fallback RPC endpoint. +- **Post:** update RPC provider fallback policy and retry tuning. + +### 2.10 Suspicious payment proof +- **Owner:** Security + BL +- **Trigger:** tx hash proof with mismatched `to`, amount, or network. +- **Detection:** verification script rejects the proof, or manual proof review flags it. +- **Immediate:** mark payment disputed, require operator review. +- **Recovery:** keep payment in `pending`, notify buyer and seller, trigger manual support workflow. +- **Post:** add automated proof assertions to webhook and verifier tests. + +### 2.11 npm/package compromise +- **Owner:** CTO + Security +- **Trigger:** new high/critical advisory tied to runtime packages. +- **Detection:** audit alerts or security team notification. +- **Immediate:** pause releases in affected components; freeze `watchtower` auto-update if needed. +- **Recovery:** patch dependency and redeploy, then run smoke tests, including the transaction smoke path. +- **Post:** verify lockfile, provenance, and dependency policy compliance. + +## 3. Incident owner roster + +Production launch must replace these role owners with named responders in the +primary on-call schedule. Until then, escalation is by role: + +- Payments (BL): backend lead for payment/ledger services +- Security: security owner from [[Security Ownership and Launch Decision Criteria]] +- DevOps (DL): deployment owner for production infrastructure +- Admin lead: operations lead for dispute and payout approval workflows + +## 4. Post-incident mandatory actions + +- incident log + root-cause and action items, +- metric threshold updates if needed, +- runbook improvements (this document), +- verification that [[Monitoring]] and [[Incident Response]] links are still accurate. 
+ +## Related + +- [[Incident Response]] +- [[Monitoring]] +- [[Deployment]] +- [[Backup & Recovery]] +- [[Webhook Security Spec]] +- [[Payment Provider Adapter Spec]] +- [[Backend Core Stack Decision Record - 2026-05-24]] diff --git a/09 - Audits/Authorization Matrix - REST and Socket.IO.md b/09 - Audits/Authorization Matrix - REST and Socket.IO.md index 182b023..701eaa9 100644 --- a/09 - Audits/Authorization Matrix - REST and Socket.IO.md +++ b/09 - Audits/Authorization Matrix - REST and Socket.IO.md @@ -37,7 +37,7 @@ reviewers: [backend, security] | **Admin** | Authenticated + `req.user.role === 'admin'`. All admin actions MUST be audit-logged. | `authenticateToken` + `roleGuard('admin')` or `authorizeRoles('admin')`. | | **Support** | Authenticated + `req.user.role === 'support'`. Read-only access to user data, dispute records, and chat. Can reset passwords and escalate to admin. Cannot modify financial records or release funds. | `authenticateToken` + `roleGuard('support')`. Controller must enforce read-only constraint. | | **Service** | Internal service-to-service calls. Authenticated via shared secret (`X-Internal-Secret` header) or restricted to localhost network. Not user-facing. | Custom middleware verifying internal header or `req.ip === '127.0.0.1'`. | -| **Step-up** | Admin + re-authenticated within last 15 minutes (configurable). Required for: payout creation/release, role changes, large refunds (>$100), user deletion, admin-wallet signing. | `authenticateToken` + `roleGuard('admin')` + step-up timestamp check from Redis session. | +| **Step-up** | Admin + re-authenticated within configured window (default 5 minutes). Required for high-risk admin actions (role changes, user deletion, payout/release, manual overrides, sensitive wallet operations). | `authenticateToken` + `roleGuard('admin')` + step-up timestamp check from Redis session. | | **HMAC** | No user auth. Verified via HMAC-SHA256 signature on raw body using `SHKEEPER_WEBHOOK_SECRET`. 
Signature-verified, not identity-verified. | `express.raw()` body parser + timing-safe HMAC comparison. | --- @@ -83,6 +83,10 @@ reviewers: [backend, security] | AUTH-R022 | PUT | /api/auth/profile | Authenticated | Owner | None | Tier 3 | No | Auth enforced. | Authenticated | | | AUTH-R023 | POST | /api/auth/update-profile | Authenticated | Owner | None | Tier 3 | No | Auth enforced. Legacy alias. | Authenticated | Duplicate of R022. | | AUTH-R024 | DELETE | /api/auth/account | Authenticated | Owner | Password re-verified | Tier 3 | Yes | Auth + password required. | Authenticated + audit | Permanent deletion. | +| AUTH-R025 | POST | /api/auth/step-up | Admin | None | Valid challenge context or credentials | Tier 6 | Yes | Not implemented | Admin + Step-up | Required by ADR for high-risk admin actions. Creates 5-minute elevated session in Redis. | +| AUTH-R026 | GET | /api/auth/sessions | Authenticated | Owner | Current refresh session exists | Tier 3 | Yes | Not implemented | Authenticated | Returns active sessions with device, IP, and session age. | +| AUTH-R027 | POST | /api/auth/revoke-session | Authenticated | Owner | Target session belongs to user | Tier 3 | Yes | Not implemented | Authenticated + audit | Revokes one session by sessionTokenHash. | +| AUTH-R028 | POST | /api/auth/revoke-all-sessions | Authenticated | Owner | Multiple active sessions loaded | Tier 3 | Yes | Not implemented | Authenticated + audit | Revokes all sessions except current. | ### 2.2 User Routes @@ -116,6 +120,14 @@ reviewers: [backend, security] | UADM-R012 | PATCH | /api/users/admin/:userId/password | Admin | None | Target user exists | Tier 6 | Yes | Inline role check. | Admin + Step-up + audit | Wipes all sessions. | | UADM-R013 | POST | /api/users/admin/:userId/resend-verification | Admin | None | User not already verified | Tier 6 | Yes | Inline role check. | Admin + audit | Triggers email. 
| +### 2.3A Admin Approval Routes + +| ID | Method | Path | Access Level | Ownership Check | State Preconditions | Rate-Limit Tier | Audit Log | Current State | Required State | Notes | +|---|---|---|---|---|---|---|---|---|---|---| +| APV-R001 | GET | /api/admin/approvals | Admin | None | None | Tier 6 | Yes | Not implemented | Admin + Step-up | Pending approval queue for high-value actions. | +| APV-R002 | POST | /api/admin/approvals/{id}/confirm | Admin | None | Approval exists, status = PENDING, approver != creator | Tier 6 | Yes | Not implemented | Admin + Step-up + audit | Confirms pending approval and executes action. | +| APV-R003 | POST | /api/admin/approvals/{id}/reject | Admin | None | Approval exists, status = PENDING | Tier 6 | Yes | Not implemented | Admin + Step-up + audit | Rejects pending approval and records reason. | + ### 2.4 Address Routes | ID | Method | Path | Access Level | Ownership Check | State Preconditions | Rate-Limit Tier | Audit Log | Current State | Required State | Notes | @@ -259,16 +271,16 @@ reviewers: [backend, security] | ID | Method | Path | Access Level | Ownership Check | State Preconditions | Rate-Limit Tier | Audit Log | Current State | Required State | Notes | |---|---|---|---|---|---|---|---|---|---|---| -| REL-R001 | POST | /api/payment/shkeeper/:id/release | Admin | None | Payment funded; no active dispute (T06); escrowState=funded | Tier 6 | Yes | Auth enforced. Admin. NO dispute check. T06. | Admin + Step-up + dispute check + audit | Builds release tx payload. | +| REL-R001 | POST | /api/payment/shkeeper/:id/release | Admin | None | Payment funded; no active dispute (T06); escrowState=funded | Tier 6 | Yes | Auth enforced. Admin. NO dispute check. T06. | Admin + Step-up + dispute check + audit, + two-person approval for payout > 1000 USD equivalent (see APV-R002/APV-R003) | Builds release tx payload. 
| | REL-R002 | POST | /api/payment/shkeeper/:id/release/confirm | Admin | None | Release tx pending; valid txHash | Tier 6 | Yes | Auth enforced. Admin. | Admin + Step-up + audit | Confirms release on-chain. | -| REL-R003 | POST | /api/payment/shkeeper/:id/refund | Admin | None | Payment funded; no active dispute; escrowState=funded | Tier 6 | Yes | Auth enforced. Admin. NO dispute check. T06. | Admin + Step-up + dispute check + audit | Builds refund tx. | +| REL-R003 | POST | /api/payment/shkeeper/:id/refund | Admin | None | Payment funded; no active dispute; escrowState=funded | Tier 6 | Yes | Auth enforced. Admin. NO dispute check. T06. | Admin + Step-up + dispute check + audit, + two-person approval for payout > 1000 USD equivalent (see APV-R002/APV-R003) | Builds refund tx. | | REL-R004 | POST | /api/payment/shkeeper/:id/refund/confirm | Admin | None | Refund tx pending; valid txHash | Tier 6 | Yes | Auth enforced. Admin. | Admin + Step-up + audit | Confirms refund on-chain. | ### 2.15 Payment Routes (SHKeeper Payout) | ID | Method | Path | Access Level | Ownership Check | State Preconditions | Rate-Limit Tier | Audit Log | Current State | Required State | Notes | |---|---|---|---|---|---|---|---|---|---|---| -| PO-R001 | POST | /api/payment/shkeeper/payout | Admin | None | No existing pending payout for same escrow | Tier 6 | Yes | Auth enforced. Admin. | Admin + Step-up + audit | Creates payout task. T05. | +| PO-R001 | POST | /api/payment/shkeeper/payout | Admin | None | No existing pending payout for same escrow | Tier 6 | Yes | Auth enforced. Admin. | Admin + Step-up + audit, + two-person approval for payout > 1000 USD equivalent (see APV-R002/APV-R003) | Creates payout task. T05. | | PO-R002 | GET | /api/payment/shkeeper/payout/status/:taskId | Authenticated | Owner or Admin | Task exists | Tier 3 | No | Auth enforced. | Authenticated | Poll payout status. 
| | PO-R003 | POST | /api/payment/shkeeper/payout/webhook | HMAC | None | Signature valid | Tier 5 | Yes | HMAC verification. | HMAC + audit | Payout state changes. | @@ -630,9 +642,10 @@ These gaps involve audit logging and presence tracking. They are important for o | Route Group | Endpoints | |---|---| -| Auth | 24 | +| Auth | 28 | | User | 9 | | User Admin | 13 | +| Admin Approval | 3 | | Address | 5 | | Purchase Request | 18 | | Delivery Code | 4 | @@ -656,7 +669,7 @@ These gaps involve audit logging and presence tracking. They are important for o | File | 9 | | Admin Cleanup | 7 | | System | 2 | -| **Total REST Endpoints** | **248** | +| **Total REST Endpoints** | **255** | ### Socket.IO Event Count @@ -691,4 +704,4 @@ These gaps involve audit logging and presence tracking. They are important for o --- -*This document was produced on 2026-05-24 as part of the Amanat authorization audit. It must be updated when: new endpoints are added, existing endpoint access levels change, new Socket.IO events are introduced, or the role model is extended. Implementation tasks should reference specific AUTH-R, USER-R, UADM-R, ADDR-R, PR-R, DC-R, OFF-R, TPL-R, SHOP-R, CAT-R, REV-R, PAY-R, SHK-R, REL-R, PO-R, DEC-R, MPAY-R, CHAT-R, NOTIF-R, DIS-R, AI-R, BLOG-R, PTS-R, FILE-R, ADM-R, SYS-R, and SOCK-E IDs from this matrix.* +*This document was produced on 2026-05-24 as part of the Amanat authorization audit. It must be updated when: new endpoints are added, existing endpoint access levels change, new Socket.IO events are introduced, or the role model is extended. 
Implementation tasks should reference specific AUTH-R, USER-R, UADM-R, APV-R, ADDR-R, PR-R, DC-R, OFF-R, TPL-R, SHOP-R, CAT-R, REV-R, PAY-R, SHK-R, REL-R, PO-R, DEC-R, MPAY-R, CHAT-R, NOTIF-R, DIS-R, AI-R, BLOG-R, PTS-R, FILE-R, ADM-R, SYS-R, and SOCK-E IDs from this matrix.* diff --git a/09 - Audits/Backend Core Stack Decision Record - 2026-05-24.md b/09 - Audits/Backend Core Stack Decision Record - 2026-05-24.md new file mode 100644 index 0000000..a01b2e2 --- /dev/null +++ b/09 - Audits/Backend Core Stack Decision Record - 2026-05-24.md @@ -0,0 +1,117 @@ +--- +title: Backend Core Stack Decision Record - 2026-05-24 +tags: [adr, architecture, backend] +created: 2026-05-24 +status: approved +reviewers: [CTO, backend, security] +--- + +# Backend Core Stack Decision Record - 2026-05-24 + +## 1. Decision + +Keep the security-critical backend core on **TypeScript/Node** for the first 12 months. + +Do **not** perform a full greenfield rewrite before the payment/auth/escrow core is fully specified and observable. + +## 2. Why this stack (today) + +The highest current risk is not framework selection; it is **financial state correctness**. + +For the next phase, the team needs: + +- provider-neutral payment abstraction, +- immutable funds ledger, +- webhook hardening and reconciliation, +- strict dispute hold behavior, +- admin step-up controls, +- production-grade operational runbooks. + +Moving to Go/Kotlin/Rust now would preserve existing risks while adding migration uncertainty and a delay in launch-readiness. + +TypeScript remains the fastest way to ship the required controls while keeping operational visibility and team velocity. + +## 3. Scope of extraction + +The “backend core” now means: + +- `Payment` orchestration and payout/release state transitions, +- auth/session validation for financial actions, +- webhook intake and reconciliation, +- ledger-derived escrow eligibility checks, +- admin-risk operations (payout/refund/adjustment). 
+ +These modules stay in the same service boundary during migration-in-place, but all calls must go through: + +- [[Payment Provider Adapter Spec]] +- [[Webhook Security Spec]] +- [[Funds Ledger and Escrow State Machine Specification]] + +Non-core modules remain where they are: + +- marketplace browsing, templates, shop settings, +- chat and notifications, +- file uploads/downloads. + +## 4. Evaluation of alternatives + +### Go + +- **Pros:** smaller runtime and dependency surface, better static guarantees. +- **Cons:** highest immediate migration cost, new operational tooling, delayed delivery of core money-movement correctness. + +### Kotlin/Java + +- **Pros:** strong enterprise ecosystem, mature auth/security libraries. +- **Cons:** heavier stack and slower delivery for a small team. + +### Rust + +- **Pros:** high correctness potential. +- **Cons:** steep delivery cost and limited team familiarity. + +### Keep TypeScript (selected) + +- **Pros:** existing team velocity, reduced migration risk, direct integration with current deployment and frontend contracts. +- **Cons:** npm supply-chain risk remains; mitigated by [[Secure Build and Supply-Chain Policy]] and strict dependency policy. + +## 5. Migration and rollout plan + +1. **Phase A (this quarter):** lock down high-risk flows in TypeScript (ledger, adapter, webhook, auth/session, runbooks). +2. **Phase B (next two quarters):** extract core services behind stable interfaces and add adapter-level contract tests. +3. **Phase C (deferred):** evaluate Go/Kotlin pilot for payout+webhook worker only if: + - Phase A and B are stable for 60 days, + - team staffing supports dual-stack operations, + - audit requirements demand lower runtime dependency exposure. + +## 6. Non-goals + +- Full frontend rewrite. +- New language migration without closed-loop reconciliation and signed-state invariants. +- New provider support that bypasses the adapter contract. + +## 7. 
Rollback criteria + +- Any increase in incident rate above baseline +20% for 24h after migration activity. +- Any unresolved ledger invariant violation (held + disputed + released + refunded mismatch). +- Any provider outage recovery that requires non-operator-tuned workarounds. + +Rollback to prior TS implementation: + +- disable any split deployment feature flag, +- switch `PAYMENT_ENABLED_PROVIDERS` back to legacy-only, +- freeze new provider routing until incident review is complete, +- complete post-incident update in this ADR. + +## 8. Ownership + +- **CTO:** final stack decision + dual-stack approvals. +- **Backend Lead (BL):** contract and adapter enforcement. +- **Security Lead (SL):** webhook/security acceptance criteria. +- **DevOps Lead (DL):** deployment safety and rollback testing. + +## Related + +- [[Backend Stack Security and Refactor Assessment - 2026-05-24]] +- [[Secure Build and Supply-Chain Policy]] +- [[Backend Funds Migration and Operational Runbooks]] diff --git a/09 - Audits/Backend Stack Security and Refactor Assessment - 2026-05-24.md b/09 - Audits/Backend Stack Security and Refactor Assessment - 2026-05-24.md index ae5595a..b823f19 100644 --- a/09 - Audits/Backend Stack Security and Refactor Assessment - 2026-05-24.md +++ b/09 - Audits/Backend Stack Security and Refactor Assessment - 2026-05-24.md @@ -345,7 +345,7 @@ Should include: - Trust boundaries: browser, backend, database, Redis, provider APIs, wallet/RPC, admin UI, Socket.IO. - Abuse cases: fake payment proof, replayed webhook, arbitrary room join, stolen token, double payout, dispute bypass, email/AI abuse. -### 2. Funds Ledger Specification +### 2. Funds Ledger and Escrow State Machine Specification Purpose: make money movement auditable and provider-independent. @@ -386,9 +386,7 @@ Should map every endpoint and socket event to: ### 5. Payment Provider Adapter Spec -Purpose: decouple business logic from SHKeeper, Request Network, manual wallet flow, and future providers. 
- -Should define: +Implemented as [[Payment Provider Adapter Spec]], including: - `createPayInIntent` - `getPayInStatus` @@ -399,13 +397,11 @@ Should define: - `getPayoutStatus` - `searchProviderPayments` -Provider-specific metadata should be namespaced and never become the canonical funds state. +Provider-specific metadata is namespaced and never used as canonical funds state. ### 6. Webhook Security Spec -Purpose: prevent forged, replayed, or silently failed provider events. - -Should define: +Implemented as [[Webhook Security Spec]]: - Raw-body signature verification. - Accepted headers and algorithms. @@ -434,6 +430,8 @@ Should define: ### 8. Realtime Authorization Spec +Implemented as [[Realtime Authorization Spec]]. + Purpose: make Socket.IO events subject to the same security model as REST. Should define: @@ -476,9 +474,7 @@ Should define: ### 11. Operational Runbooks -Purpose: make security incidents and payment failures survivable. - -Should include: +Implemented as [[Backend Funds Migration and Operational Runbooks]]: - Failed webhook. - Duplicate payment. diff --git a/09 - Audits/Payment Provider Adapter Spec.md b/09 - Audits/Payment Provider Adapter Spec.md new file mode 100644 index 0000000..d9c83e5 --- /dev/null +++ b/09 - Audits/Payment Provider Adapter Spec.md @@ -0,0 +1,173 @@ +--- +title: Payment Provider Adapter Spec +tags: [adapters, payments, specification, architecture] +created: 2026-05-24 +status: advisory +reviewers: [backend, security, product] +--- + +# Payment Provider Adapter Spec + +This specification standardizes how payment providers are plugged in so platform logic +does not depend on SHKeeper or any single webhook implementation. + +The contract below replaces provider-specific branching in domain services and should +be used by all pay-in, payout, release, and reconciliation logic. + +> Canonical implementation note: this is an advisory ADR for tasks in `task 4.6`. 
+> It maps to [[Funds Ledger and Escrow State Machine Specification]] and [[Webhook Security Spec]]. + +## 1. Core provider contract + +Implementations expose one typed adapter: + +```ts +interface PaymentProviderAdapter { + readonly provider: "shkeeper" | "request_network" | "manual_wallet" | "admin_wallet" | string; + + createPayInIntent(input: PayInIntentInput): Promise<PayInIntentResult>; + getPayInStatus(input: PayInStatusInput): Promise<PayInStatusResult>; + handleProviderWebhook(input: ProviderWebhookInput): Promise<ProviderWebhookResult>; + createHostedPaymentLink(input: HostedLinkInput): Promise<HostedLinkResult>; + createReleaseInstruction(input: ReleaseInstructionInput): Promise<ReleaseInstructionResult>; + createRefundInstruction(input: RefundInstructionInput): Promise<RefundInstructionResult>; + getPayoutStatus(input: PayoutStatusInput): Promise<PayoutStatusResult>; + searchProviderPayments(input: ProviderSearchInput): Promise<ProviderSearchResult>; +} +``` + +All adapters must return a normalized result shape: + +```ts +type NormalizedProviderStatus = "pending" | "processing" | "confirmed" | "completed" | "failed" | "cancelled" | "released" | "refunded"; +type NormalizedProviderEvent = { + providerPaymentId: string; + purchaseRequestId?: string; + requestId?: string; + providerReference?: string; + amount: string; // decimal string + currency: string; + status: NormalizedProviderStatus; + transactionHash?: string; + providerEventType: string; + receivedAt: string; // ISO timestamp + rawFingerprint: string; // provider payload hash +}; +``` + +## 2. Method semantics + +### 2.1 `createPayInIntent` + +- Create a provider-specific payment intent from a canonical request. +- Must return: + - `providerPaymentId` (source of truth for future reconciliation), + - canonical `status`, + - `payInUrl` when redirect/payment-page flow is used, + - an expiry timestamp. +- Must persist provider metadata under `payment.providerData.<provider>`. + +### 2.2 `getPayInStatus` + +- Query provider status for an existing intent. +- Must map provider statuses into `NormalizedProviderStatus` and include a provider-specific raw snapshot.
+- Must be idempotent and side-effect free. + +### 2.3 `handleProviderWebhook` + +- Input must include raw body bytes, headers, provider identifier, and parsed envelope. +- Must verify signatures before parsing business fields. +- On success, emit canonical domain events and return an idempotency decision: + - `processed` for first apply, + - `duplicate` for replay, + - `ignored` for unknown payment / no-op transitions. + +### 2.4 `createHostedPaymentLink` + +- Return the user-visible payment URL + optional redirect/callback endpoints. +- Should support provider aliases (for migration aliasing, e.g., `request-network` vs `request_network`). + +### 2.5 `createReleaseInstruction` and `createRefundInstruction` + +- Produce signed/payload instructions and pre-check: + - account/release eligibility, + - dispute hold not active, + - sufficient releasable balance (ledger-derived), + - admin approval requirements if configured. +- Must never directly mutate release state. +- Must be idempotent by `(paymentId, actionType)` where action type is `release|refund`. + +### 2.6 `getPayoutStatus` + +- Return state of pending/processing payout tasks and chain/on-chain confirmation status. +- Return normalized status to domain services: + - `processing` for queued/broadcast not-finalized, + - `completed` for finalized payment, + - `failed` with provider error code when rejected. + +### 2.7 `searchProviderPayments` + +- Used for reconciliation and manual verification. +- Must support: + - `providerPaymentId`/`requestId` lookup, + - time-window pagination, + - optional min/max amount filtering. +- Must never be the primary source for state transitions without reconciliation checks. + +## 3. 
Routing and selection + +Provider selection follows environment-configured capability flags: + +- `PAYMENT_ENABLED_PROVIDERS` (comma-separated allowlist), +- `PAYMENT_DEFAULT_PROVIDER` (read-first fallback), +- `PAYMENT_ROLLBACK_PROVIDER` (read-only fallback target for cutbacks), +- `PAYMENT_MODE`: + - `standard`: normal provider routing, + - `dry_run`: no writes, status-only, + - `read_only`: no new pay-in/intent writes. + +Selection rules: + +1. Validate provider support and provider license/credential validity. +2. Route legacy requests to `shkeeper` when explicit migration window is active. +3. For unknown `provider`, return a `400 Bad Request` with explicit operator-visible error code. +4. If requested provider is disabled, return `409` with migration explanation and owner-visible hint for operator override. + +## 4. Canonical metadata contract + +Payment documents keep provider-specific data namespaced under: + +- `metadata.providers.<provider>.rawPayload` +- `metadata.providers.<provider>.rawEvents[]` +- `metadata.providers.<provider>.providerPaymentId` +- `metadata.providers.<provider>.lastWebhookAt` + +Domain services must never read `metadata.providers.*` as mutable funds state. They must use ledger-derived balances and canonical status fields only. + +## 5. Error contract + +All adapter methods return standard failure modes: + +- `retryable: true` for transient provider errors (timeouts, 5xx, queue backpressure). +- `retryable: false` for invalid payloads, invalid signatures, and authorization failures. +- `errorCode` must be stable across retries for auditability. + +## 6. Test coverage required + +- Contract tests per adapter: + - `createPayInIntent`, status polling, webhook handling + - invalid/absent signature behavior + - duplicate webhook idempotency + - unknown payment reference behavior + - rollback selection and read-only mode behavior. +- Reconciliation tests: + - provider backfill for missing payment references, + - status drift correction, + - duplicate/missing event merge.
+ +## Related + +- [[Webhook Security Spec]] +- [[Funds Ledger and Escrow State Machine Specification]] +- [[Backend Core Stack Decision Record - 2026-05-24]] +- [[Backend Funds Migration and Operational Runbooks]] diff --git a/09 - Audits/Realtime Authorization Spec.md b/09 - Audits/Realtime Authorization Spec.md new file mode 100644 index 0000000..a9a8f51 --- /dev/null +++ b/09 - Audits/Realtime Authorization Spec.md @@ -0,0 +1,153 @@ +--- +title: Realtime Authorization Spec +tags: [security, realtime, socketio, authorization] +created: 2026-05-24 +status: advisory +--- + +# Realtime Authorization Spec + +This document defines the target authorization model for Socket.IO events in the +escrow platform. It closes Taskmaster subtask 4.4 alongside +[[Authorization Matrix - REST and Socket.IO]]. + +## 1. Decision + +Socket.IO must use the same trust boundary as REST: + +- every socket connection is authenticated during the handshake, +- room membership is derived by the server, +- clients cannot subscribe to rooms by supplying arbitrary user, request, chat, + seller, or buyer IDs, +- server-to-client emissions are targeted to authorized rooms only, +- sensitive payment, payout, dispute, delivery-code, and chat payloads are never + sent through global broadcasts. + +## 2. Handshake Authentication + +Client connects with an access token in `handshake.auth.token`. + +Server requirements: + +1. verify the JWT signature and standard claims, +2. reject expired, malformed, revoked, or missing tokens, +3. attach `{ userId, roles, sessionId, jti }` to `socket.data`, +4. disconnect immediately when authentication fails, +5. log authentication failures without recording token values. + +Refresh tokens are not accepted by Socket.IO. Clients must refresh through REST +and reconnect with a fresh access token. + +## 3. 
Server-Derived Base Rooms + +On successful connection the server may join only rooms derivable from the +authenticated principal: + +| Room | Eligibility | Source | +|---|---|---| +| `user-{userId}` | authenticated user | JWT subject | +| `seller-{userId}` | authenticated user with seller role | JWT roles or user record | +| `buyer-{userId}` | authenticated user with buyer role | JWT roles or user record | +| `sellers` | authenticated seller | JWT roles or user record | +| `buyers` | authenticated buyer | JWT roles or user record | + +Clients must not provide `userId`, `sellerId`, or `buyerId` to join these rooms. + +## 4. Resource Rooms + +Resource rooms require database authorization before join. + +| Room | Eligibility | Authorization Query | +|---|---|---| +| `request-{requestId}` | buyer, selected seller, assigned admin/moderator | purchase request participant check | +| `chat-{chatId}` | chat participant or assigned support/admin user | chat participant check | +| `dispute-{disputeId}` | dispute party, assigned moderator, admin | dispute participant/assignment check | +| `template-checkout-{checkoutId}` | checkout owner or service-controlled UI session | checkout ownership check | + +Membership must be rechecked when ownership or state changes. If a request, +chat, or dispute loses a participant, the server must remove that user's sockets +from the associated room. + +## 5. Client Event Policy + +Allowed client-originated events: + +| Event | Required Authorization | Notes | +|---|---|---| +| `join-request-room` | participant check | May remain only as a request for server validation. | +| `leave-request-room` | current membership | User may leave an allowed room. | +| `join-chat-room` | participant check | May remain only as a request for server validation. | +| `leave-chat-room` | current membership | User may leave an allowed room. 
| +| `typing-start` / `typing-stop` | current `chat-{chatId}` membership | `userId` in payload is ignored; server derives sender. | + +Removed or deprecated client-originated events: + +| Event | Replacement | +|---|---| +| `join-user-room` | server auto-join on handshake | +| `join-seller-room` / `join-buyer-room` | server auto-join from authenticated role | +| `user-online` | server emits presence after authenticated connection | + +## 6. Emission Policy + +Server emissions must target the narrowest authorized room: + +| Data Class | Allowed Target | +|---|---| +| user notifications | `user-{recipientId}` | +| buyer/seller offer updates | relevant `user-*`, `buyer-*`, `seller-*`, or `request-*` room | +| payment status | buyer and seller user rooms, request room if both parties may see it | +| payout status | seller user room and admin operations room only | +| delivery code | seller user room only | +| chat messages | `chat-{chatId}` | +| dispute events | `dispute-{disputeId}` and assigned admin/moderator room | + +Global payment and payout events are prohibited because they expose financial +metadata to unrelated users. + +## 7. Payload Rules + +- Never trust `userId`, `role`, `sellerId`, or `buyerId` from socket payloads. +- Derive sender identity from `socket.data.userId`. +- Do not emit delivery verification codes to buyer-visible rooms. +- Redact wallet addresses, tx hashes, and provider references unless the target + user is a party to the transaction or an authorized operator. +- Keep payload schemas consistent with REST read permissions. + +## 8. 
Rate Limiting and Audit + +Socket event rate limits: + +| Event Class | Limit | +|---|---| +| room join attempts | 30 per 15 minutes per user | +| typing events | 120 per minute per socket | +| chat message events | same policy as REST chat message creation | +| failed authorization checks | 10 per 15 minutes per user, then disconnect | + +Audit log required for: + +- failed room authorization checks, +- admin/moderator joins to dispute or request rooms, +- attempts to join user/seller/buyer rooms for another principal, +- global payment or payout emission rejection. + +## 9. Tests + +Minimum verification before launch: + +1. invalid or missing JWT cannot connect, +2. user cannot join another user's `user-*`, `seller-*`, or `buyer-*` room, +3. user cannot join a request/chat/dispute room without participant status, +4. removed participant is evicted from the resource room, +5. payment and payout events are not emitted globally, +6. delivery code is emitted only to the seller, +7. socket event rate limits disconnect abusive clients, +8. audit events are written for denied room joins. + +## Related + +- [[Authorization Matrix - REST and Socket.IO]] +- [[Threat Model - Amanat Escrow Platform]] +- [[Session and Authentication Architecture Decision]] +- [[Backend Stack Security and Refactor Assessment - 2026-05-24]] diff --git a/09 - Audits/Session and Authentication Architecture Decision.md b/09 - Audits/Session and Authentication Architecture Decision.md index 0d3431f..33f7917 100644 --- a/09 - Audits/Session and Authentication Architecture Decision.md +++ b/09 - Audits/Session and Authentication Architecture Decision.md @@ -350,6 +350,16 @@ High-risk admin actions require re-authentication. Upon successful re-authentica 8. Frontend retries the original high-risk action. 9. The action proceeds. +### Traceability to Authorization Matrix + +This matrix maps to: + +- `AUTH-R025` (`POST /api/auth/step-up`) for the step-up API entry point. 
+- `AUTH-R026` (`GET /api/auth/sessions`), `AUTH-R027` (`POST /api/auth/revoke-session`), `AUTH-R028` (`POST /api/auth/revoke-all-sessions`) for session controls. +- `APV-R001`, `APV-R002`, `APV-R003` for approval queue + confirm/reject workflow. + +Status: these rows are marked **Not implemented** in the matrix while this ADR remains in planning/rollout state. + ### Two-person approval flow For actions requiring two-person approval: @@ -659,19 +669,19 @@ If any migration step causes issues: | Threat | Document | |---|---| -| T01 (fake payment proof) | [[Payment Provider Adapter Spec]] (future) | -| T02 (webhook replay) | [[Webhook Security Spec]] (future) | -| T03 (arbitrary socket room join) | Realtime Authorization Spec (future) | -| T05 (double payout) | [[Funds Ledger Specification]] (future) | -| T06 (dispute bypass) | Escrow State Machine (future) | +| T01 (fake payment proof) | [[Funds Ledger and Escrow State Machine Specification]], [[Payment Provider Adapter Spec]] | +| T02 (webhook replay) | [[Webhook Security Spec]] | +| T03 (arbitrary socket room join) | [[Realtime Authorization Spec]] | +| T05 (double payout) | [[Funds Ledger and Escrow State Machine Specification]] | +| T06 (dispute bypass) | [[Funds Ledger and Escrow State Machine Specification]] | | T07 (email abuse) | Rate limiting implementation | | T08 (AI cost abuse) | Rate limiting + auth implementation | -| T09 (admin privilege escalation) | [[Authorization Matrix]] + step-up auth (this ADR) | +| T09 (admin privilege escalation) | [[Authorization Matrix - REST and Socket.IO]] + step-up auth (this ADR) | | T11 (unauthenticated payment endpoints) | Auth middleware implementation | | T12 (rate limit bypass) | Rate limiting implementation | | T14 (supply-chain) | [[Secure Build and Supply-Chain Policy]] | | T16 (deep-link tampering) | Telegram initData verification | -| T17 (provider outage) | Operational runbooks | +| T17 (provider outage) | [[Backend Funds Migration and Operational Runbooks]] | 
| T18 (insider manipulation) | Multi-sig wallet + funds ledger + two-person approval (this ADR) | | T19 (price manipulation) | Offer status enforcement | | T20 (delivery brute force) | Rate limiting + code entropy | diff --git a/09 - Audits/Task 4 Backend Security Architecture Verification Report.md b/09 - Audits/Task 4 Backend Security Architecture Verification Report.md new file mode 100644 index 0000000..eb026b5 --- /dev/null +++ b/09 - Audits/Task 4 Backend Security Architecture Verification Report.md @@ -0,0 +1,106 @@ +--- +title: Task 4 Backend Security Architecture Verification Report +tags: [taskmaster, verification, security, backend] +created: 2026-05-24 +status: complete +--- + +# Task 4 Backend Security Architecture Verification Report + +Taskmaster task 4 is complete as an advisory architecture and handoff package. +The task defines how the backend security/refactor assessment is converted into +implementation criteria without rewriting or disrupting the current backend +model. + +## 1. 
Deliverable map + +| Taskmaster item | Deliverable | +|---|---| +| 4.1 Security ownership and launch criteria | [[Security Ownership and Launch Decision Criteria]] | +| 4.2 Escrow platform threat model | [[Threat Model - Amanat Escrow Platform]] | +| 4.3 Funds ledger and escrow state machine | [[Funds Ledger and Escrow State Machine Specification]] | +| 4.4 REST and Socket.IO authorization matrix | [[Authorization Matrix - REST and Socket.IO]], [[Realtime Authorization Spec]] | +| 4.5 Session, passkey, and admin step-up architecture | [[Session and Authentication Architecture Decision]] | +| 4.6 Webhook security and payment adapter contracts | [[Webhook Security Spec]], [[Payment Provider Adapter Spec]] | +| 4.7 Secure build and supply-chain policy | [[Secure Build and Supply-Chain Policy]] | +| 4.8 Backend-core stack decision | [[Backend Core Stack Decision Record - 2026-05-24]] | +| 4.9 Migration and operational runbooks | [[Backend Funds Migration and Operational Runbooks]] | + +## 2. Architecture decisions verified + +- The current TypeScript/Node backend remains the production delivery path for + the next security-hardening phase. +- A full backend rewrite is explicitly out of scope until ledger, webhook, + provider, auth/session, and reconciliation contracts are stable and observable. +- Payment providers are optional and provider-neutral behind adapter contracts. +- Webhooks must use raw-body signature verification, replay prevention, + idempotency, and dead-letter capture. +- Funds movement must be derived from the canonical ledger and escrow state + machine, not provider metadata. +- Admin release, refund, payout, role, and destructive account operations require + step-up authentication and audit logging; high-risk payouts require + two-person approval. +- Socket.IO room membership must be server-derived and authorization checked, + with global financial broadcasts prohibited. + +## 3. 
Verification commands + +Executed from `nick-doc` on 2026-05-24: + +```bash +npx task-master show 4 +npx task-master set-status --id=4.6 --status=done +npx task-master set-status --id=4.7 --status=done +npx task-master set-status --id=4.8 --status=done +npx task-master set-status --id=4.9 --status=done +npx task-master set-status --id=4.3 --status=done +npx task-master set-status --id=4.4 --status=done +npx task-master set-status --id=4.5 --status=done +npx task-master set-status --id=4 --status=done +node - <<'NODE' +const fs=require('fs'); +const data=JSON.parse(fs.readFileSync('.taskmaster/tasks/tasks.json','utf8')); +const t=data.master.tasks.find(x=>String(x.id)==='4'); +console.log(JSON.stringify({ + task:t.status, + subtasks:t.subtasks.map(s=>({id:s.id,status:s.status,title:s.title})) +}, null, 2)); +NODE +``` + +A one-off Node link checker also parsed Task 4 wiki links and verified they +resolve to markdown files; threat IDs such as `[[T05]]` were treated as allowed +shorthand references. + +## 4. Verification result + +- Taskmaster JSON reports task 4 as `done`. +- Taskmaster JSON reports subtasks 4.1 through 4.9 as `done`. +- `Authorization Matrix - REST and Socket.IO` now links directly to [[Realtime Authorization Spec]]. +- Task 4 wiki links resolve to existing markdown files, excluding threat-ID + shorthand references such as `[[T05]]`. +- Incident ownership in the task 4 runbook was replaced with explicit role owners + that can be mapped to named responders before production launch. +- Remaining implementation tests belong to follow-up backend tasks because task + 4 is a documentation and architecture handoff task. + +## 5. 
Follow-up implementation test requirements + +Implementation tasks derived from task 4 must include: + +- ledger invariant unit tests for every escrow transition, +- payment provider adapter contract tests for SHKeeper, Request Network, manual + wallet, and disabled-provider modes, +- webhook signature, replay, duplicate, and DLQ tests, +- REST authorization tests for every gap listed in the authorization matrix, +- Socket.IO handshake, room authorization, targeted emission, and rate-limit + tests, +- session rotation, revocation, passkey disabled/enabled, and admin step-up + tests, +- runbook drills for provider outage, leaked webhook secret, stuck release, + suspicious payment proof, and compromised admin. + +## 6. Residual risk + +This report verifies the Task 4 architecture package, not production behavior. +Backend implementation work must still enforce these controls before launch. diff --git a/09 - Audits/Threat Model - Amanat Escrow Platform.md b/09 - Audits/Threat Model - Amanat Escrow Platform.md index ebe9b3e..ca30755 100644 --- a/09 - Audits/Threat Model - Amanat Escrow Platform.md +++ b/09 - Audits/Threat Model - Amanat Escrow Platform.md @@ -557,15 +557,15 @@ The following remediation documents (from the recommended documentation set in [ | Remediation Document | Threats Addressed | |---|---| -| Funds Ledger Specification | T05, T18, T23 | -| Escrow State Machine | T06, T19, T23 | +| Funds Ledger and Escrow State Machine Specification | T05, T18, T23 | +| Funds Ledger and Escrow State Machine Specification | T06, T19, T23 | | Authorization Matrix | T09, T21 | | Webhook Security Spec | T02 | | Session and Auth Architecture | T04, T10, T13, T22 | | Realtime Authorization Spec | T03 | | Payment Provider Adapter Spec | T01, T11, T17 | | Secure Build and Supply-Chain Policy | T14 | -| Operational Runbooks | T17 | +| Backend Funds Migration and Operational Runbooks | T17 | --- diff --git a/09 - Audits/Webhook Security Spec.md b/09 - Audits/Webhook Security 
Spec.md new file mode 100644 index 0000000..bfb77de --- /dev/null +++ b/09 - Audits/Webhook Security Spec.md @@ -0,0 +1,150 @@ +--- +title: Webhook Security Spec +tags: [webhooks, security, audit, payments] +created: 2026-05-24 +status: advisory +reviewers: [backend, security, operations] +--- + +# Webhook Security Spec + +This document defines signed callback handling for all payment and payout providers. +It closes the gaps in [[Security Architecture]] by turning webhook behavior into an explicit, +auditable contract. + +The scope is inbound callbacks only: + +- SHKeeper pay-in (`/api/payment/shkeeper/webhook`) +- SHKeeper payout (`/api/payment/shkeeper/payout/webhook`) +- Request Network (`/api/payment/request-network/webhook`) +- Manual/admin reconciliation channels (where applicable) + +## 1. Canonical event envelope + +All callbacks are normalized by [[Payment Provider Adapter Spec]] into: + +```ts +type ProviderCallback = { + provider: "shkeeper" | "request_network" | "manual_wallet" | "admin_wallet" | string; + providerPaymentId: string; + purchaseRequestId?: string; + requestId?: string; + deliveryId?: string; + eventType: string; // e.g., paid, payout_completed, status_update + status: string; // provider-specific raw status + normalizedStatus: "pending" | "completed" | "failed" | "cancelled" | "released" | "refunded"; + amount?: string; + currency?: string; + transactionHash?: string; + occurredAt?: string; // ISO 8601 if provided + receivedAt: string; // server-side receive time + rawFingerprint: string; // sha256(raw_body) +}; +``` + +Callbacks are processed only through adapter entry points; provider-specific parsing remains private to the adapter. + +## 2. Signature verification + +### 2.1 Required mechanics + +- Verify signatures against raw request bytes, **before JSON parsing**. +- Use constant-time comparison and short-circuit to 401/403 on mismatch. +- Never disable verification outside local-only test tooling. 
+- Store raw payload hash (`rawFingerprint`) for forensics and idempotency checks. + +### 2.2 Provider headers + +| Provider | Header(s) | +|---|---| +| SHKeeper | `x-shkeeper-signature` | +| Request Network | `x-request-network-signature` | +| Test override (local only) | explicitly documented in deployment notes, never in production | + +If the expected signature header is absent or malformed, treat it as a non-retryable client error. + +## 3. Replay prevention and idempotency + +For each callback, store and enforce uniqueness on one of the following keys: + +- `deliveryId` + `provider` + `eventType`, or +- `(providerPaymentId, normalizedStatus, provider)` when the provider has no delivery id. + +Replay rules: + +- First successful write path = **processed**. +- Same key seen again with no state change = **duplicate** (HTTP 200 response, no side effects). +- Same key seen with a different payload hash = **conflict** (HTTP 409, captured to DLQ). + +## 4. Unknown and duplicate behavior + +| Condition | Response | Side effects | +|---|---|---| +| Signature valid, unknown `providerPaymentId` | `200` (`unknown_payment`) in v1 mode / `404` in strict mode | no state write, record DLQ entry for operator review | +| Known `providerPaymentId`, already terminal | `200` (`duplicate_terminal`) | no state write | +| Known `providerPaymentId`, stale status transition | `200` (`duplicate_or_out_of_order`) | no state write | +| Invalid or missing signature | `401` | no state write | +| Malformed payload | `400` | no state write | + +## 5. Retry semantics + +- Callback senders (providers) may retry on: + - transient network failures, + - 5xx/provider internal timeouts, + - explicit retryable status from endpoint. +- Retry is triggered only on non-2xx codes for SHKeeper and Request Network. +- Recommended handler mapping: + - `401/400` = do not retry (hard fail), + - `409` = do not retry until manual release, + - `500/503` = retry. + +## 6. 
Dead-letter and replay storage + +Persist all failed callbacks for at least 7 days in append-only storage: + +- `providerWebhookFailures` +- key fields: `provider`, `deliveryId`, `providerPaymentId`, `requestPath`, `requestHeaders`, `rawFingerprint`, `statusCode`, `errorCode`, `attemptCount`, `nextRetryAt`, `rawBodyRef`, `createdAt`. +- If storage is unavailable, fail closed and raise a high-severity ops alert. + +Retention policy: + +- 30 days for `success==true`, +- 180 days for `unknown_payment`, `repeated_conflict`, `signature_failure`, +- immediate alert if retry queue exceeds 500 entries for a provider. + +## 7. Alerting thresholds + +- `failed_webhook_count` over 1 minute: + - warning at `> 20`, + - critical at `> 100`. +- signature failures: + - warning at `> 5` in 5 minutes, + - critical at `> 20` in 5 minutes. +- duplicate ratio: + - warning if `duplicates / total >= 0.15` for 10 minutes. +- dead-letter growth: + - warning at `+200` new entries/hour, + - critical at `+500`/hour. + +## 8. Required operator signals + +Webhook health checks should expose: + +- last-seen timestamp by provider, +- delivery backlog depth, +- per-status counters (`processed`, `duplicate`, `unknown`, `conflict`, `signature_failure`), +- DLQ length and oldest entry age. + +## 9. Testing requirements + +- Signature bypass tests (must remain false in staging/prod), +- replay/delivery-id duplicate tests, +- malformed payload tests, +- unknown payment tests, +- non-terminal duplicate suppression tests. 
+## Related + +- [[Payment Provider Adapter Spec]] +- [[Error Codes]] +- [[Backend Funds Migration and Operational Runbooks]] diff --git a/Taskmaster/Tasks/task-4-3.md b/Taskmaster/Tasks/task-4-3.md index 8f469bd..bb0fe7e 100644 --- a/Taskmaster/Tasks/task-4-3.md +++ b/Taskmaster/Tasks/task-4-3.md @@ -1,23 +1,23 @@ --- taskmaster_id: "4.3" -status: "pending" +status: "done" priority: "high" depends_on: ["2"] parent_id: "4" source: "taskmaster" -generated_at: "2026-05-24T07:15:25.199Z" +generated_at: "2026-05-24T07:26:29.052Z" --- # 4.3 - Specify funds ledger and escrow state machine -- [ ] 4.3 - Specify funds ledger and escrow state machine #taskmaster #priority/high #status/pending ⏫ 🆔 tm-4-3 ⛔ tm-2 +- [x] 4.3 - Specify funds ledger and escrow state machine #taskmaster #priority/high #status/done ⏫ 🆔 tm-4-3 ⛔ tm-2 ## Metadata | Field | Value | | --- | --- | | Taskmaster ID | 4.3 | -| Status | pending | +| Status | done | | Priority | high | | Dependencies | 2 | | Parent | 4 - Define backend security and refactor strategy from latest audit | @@ -28,6 +28,8 @@ Define canonical money movement and legal state transitions before refactor or p ## Details +Completed. Produced `09 - Audits/Funds Ledger and Escrow State Machine Specification.md` (states, transitions, invariants, and migration guidance for canonical funds/escrow transitions). + Create specs for FundsAccount, LedgerEntry, FundsBalance, gross paid, provider fees, platform fees, held, disputed, releasable, released, refunded, idempotency keys, reconciliation behavior, purchase request states, payment states, escrow/funds states, dispute states, valid transitions, forbidden transitions, and release/refund/admin override preconditions.
## Verification diff --git a/Taskmaster/Tasks/task-4-4.md b/Taskmaster/Tasks/task-4-4.md index 4415ff5..c2b5e48 100644 --- a/Taskmaster/Tasks/task-4-4.md +++ b/Taskmaster/Tasks/task-4-4.md @@ -1,23 +1,23 @@ --- taskmaster_id: "4.4" -status: "pending" +status: "done" priority: "high" depends_on: ["2"] parent_id: "4" source: "taskmaster" -generated_at: "2026-05-24T07:15:25.199Z" +generated_at: "2026-05-24T07:26:29.052Z" --- # 4.4 - Create authorization matrix for REST and Socket.IO -- [ ] 4.4 - Create authorization matrix for REST and Socket.IO #taskmaster #priority/high #status/pending ⏫ 🆔 tm-4-4 ⛔ tm-2 +- [x] 4.4 - Create authorization matrix for REST and Socket.IO #taskmaster #priority/high #status/done ⏫ 🆔 tm-4-4 ⛔ tm-2 ## Metadata | Field | Value | | --- | --- | | Taskmaster ID | 4.4 | -| Status | pending | +| Status | done | | Priority | high | | Dependencies | 2 | | Parent | 4 - Define backend security and refactor strategy from latest audit | @@ -28,6 +28,8 @@ Map every endpoint and realtime event to access level, ownership checks, state p ## Details +Completed. Produced `09 - Audits/Authorization Matrix - REST and Socket.IO.md` and `09 - Audits/Realtime Authorization Spec.md`. + Include public/authenticated/owner/buyer/seller/admin/support/service-role classifications. Socket.IO rooms must be server-derived from authenticated identity, not client-supplied user IDs.
## Verification diff --git a/Taskmaster/Tasks/task-4-5.md b/Taskmaster/Tasks/task-4-5.md index 0ceade9..c55397b 100644 --- a/Taskmaster/Tasks/task-4-5.md +++ b/Taskmaster/Tasks/task-4-5.md @@ -1,23 +1,23 @@ --- taskmaster_id: "4.5" -status: "pending" +status: "done" priority: "high" depends_on: ["2"] parent_id: "4" source: "taskmaster" -generated_at: "2026-05-24T07:15:25.199Z" +generated_at: "2026-05-24T07:26:29.052Z" --- # 4.5 - Decide session, passkey, and admin step-up architecture -- [ ] 4.5 - Decide session, passkey, and admin step-up architecture #taskmaster #priority/high #status/pending ⏫ 🆔 tm-4-5 ⛔ tm-2 +- [x] 4.5 - Decide session, passkey, and admin step-up architecture #taskmaster #priority/high #status/done ⏫ 🆔 tm-4-5 ⛔ tm-2 ## Metadata | Field | Value | | --- | --- | | Taskmaster ID | 4.5 | -| Status | pending | +| Status | done | | Priority | high | | Dependencies | 2 | | Parent | 4 - Define backend security and refactor strategy from latest audit | @@ -28,6 +28,8 @@ Choose browser session model and high-risk admin authentication requirements. ## Details +Completed. Produced `09 - Audits/Session and Authentication Architecture Decision.md`. + Decide localStorage versus httpOnly cookies, access/refresh token lifetimes, CSRF strategy, refresh rotation, WebAuthn requirements, OAuth requirements, device/session revocation, and whether payouts/role changes require step-up authentication or two-person approval.
## Verification diff --git a/Taskmaster/Tasks/task-4-6.md b/Taskmaster/Tasks/task-4-6.md index 6b6f581..ec93da4 100644 --- a/Taskmaster/Tasks/task-4-6.md +++ b/Taskmaster/Tasks/task-4-6.md @@ -1,23 +1,23 @@ --- taskmaster_id: "4.6" -status: "pending" +status: "done" priority: "high" depends_on: ["3"] parent_id: "4" source: "taskmaster" -generated_at: "2026-05-24T07:15:25.199Z" +generated_at: "2026-05-24T07:26:29.052Z" --- # 4.6 - Specify webhook security and provider adapter contracts -- [ ] 4.6 - Specify webhook security and provider adapter contracts #taskmaster #priority/high #status/pending ⏫ 🆔 tm-4-6 ⛔ tm-3 +- [x] 4.6 - Specify webhook security and provider adapter contracts #taskmaster #priority/high #status/done ⏫ 🆔 tm-4-6 ⛔ tm-3 ## Metadata | Field | Value | | --- | --- | | Taskmaster ID | 4.6 | -| Status | pending | +| Status | done | | Priority | high | | Dependencies | 3 | | Parent | 4 - Define backend security and refactor strategy from latest audit | @@ -28,6 +28,8 @@ Define provider-neutral payment interface and signed webhook processing rules. ## Details +Completed. Produced `09 - Audits/Webhook Security Spec.md` and `09 - Audits/Payment Provider Adapter Spec.md`. + Document createPayInIntent, getPayInStatus, handleProviderWebhook, createHostedPaymentLink, createReleaseInstruction, createRefundInstruction, getPayoutStatus, searchProviderPayments, raw-body signature verification, replay prevention, delivery ID idempotency, duplicate/unknown event behavior, retry semantics, dead-letter/replay storage, and alert thresholds.
## Verification diff --git a/Taskmaster/Tasks/task-4-8.md b/Taskmaster/Tasks/task-4-8.md index 7f4cbd5..948e31c 100644 --- a/Taskmaster/Tasks/task-4-8.md +++ b/Taskmaster/Tasks/task-4-8.md @@ -1,23 +1,23 @@ --- taskmaster_id: "4.8" -status: "pending" +status: "done" priority: "medium" depends_on: ["2", "3", "4", "5", "6", "7"] parent_id: "4" source: "taskmaster" -generated_at: "2026-05-24T07:15:25.199Z" +generated_at: "2026-05-24T07:26:29.052Z" --- # 4.8 - Make backend-core stack decision -- [ ] 4.8 - Make backend-core stack decision #taskmaster #priority/medium #status/pending 🔼 🆔 tm-4-8 ⛔ tm-2 ⛔ tm-3 ⛔ tm-4 ⛔ tm-5 ⛔ tm-6 ⛔ tm-7 +- [x] 4.8 - Make backend-core stack decision #taskmaster #priority/medium #status/done 🔼 🆔 tm-4-8 ⛔ tm-2 ⛔ tm-3 ⛔ tm-4 ⛔ tm-5 ⛔ tm-6 ⛔ tm-7 ## Metadata | Field | Value | | --- | --- | | Taskmaster ID | 4.8 | -| Status | pending | +| Status | done | | Priority | medium | | Dependencies | 2, 3, 4, 5, 6, 7 | | Parent | 4 - Define backend security and refactor strategy from latest audit | @@ -28,6 +28,8 @@ Choose whether the security-critical backend core remains TypeScript or moves to ## Details +Completed. Produced `09 - Audits/Backend Core Stack Decision Record - 2026-05-24.md`. + Evaluate team capability, two-year maintainability, operational footprint, rewrite cost, dual-stack complexity, auditability, supply-chain exposure, and which modules belong in a payment/auth/escrow core versus the existing marketplace/chat API. 
## Verification diff --git a/Taskmaster/Tasks/task-4-9.md b/Taskmaster/Tasks/task-4-9.md index 2842fd5..4b08e1d 100644 --- a/Taskmaster/Tasks/task-4-9.md +++ b/Taskmaster/Tasks/task-4-9.md @@ -1,23 +1,23 @@ --- taskmaster_id: "4.9" -status: "pending" +status: "done" priority: "medium" depends_on: ["8"] parent_id: "4" source: "taskmaster" -generated_at: "2026-05-24T07:15:25.199Z" +generated_at: "2026-05-24T07:26:29.052Z" --- # 4.9 - Create migration and operational runbooks -- [ ] 4.9 - Create migration and operational runbooks #taskmaster #priority/medium #status/pending 🔼 🆔 tm-4-9 ⛔ tm-8 +- [x] 4.9 - Create migration and operational runbooks #taskmaster #priority/medium #status/done 🔼 🆔 tm-4-9 ⛔ tm-8 ## Metadata | Field | Value | | --- | --- | | Taskmaster ID | 4.9 | -| Status | pending | +| Status | done | | Priority | medium | | Dependencies | 8 | | Parent | 4 - Define backend security and refactor strategy from latest audit | @@ -28,6 +28,8 @@ Document rollout, rollback, and incident response for the selected backend/funds ## Details +Completed. Produced `08 - Operations/Backend Funds Migration and Operational Runbooks.md`. + Include SHKeeper legacy read path, provider feature flag, ledger backfill, validation report before enforcement, rollback criteria, webhook cutoff, manual reconciliation, failed webhook, duplicate/missing payment, stuck release, disputed release attempt, compromised admin, leaked API key, provider outage, chain/RPC outage, suspicious payment proof, and npm/package compromise. 
## Verification diff --git a/Taskmaster/Tasks/task-4.md b/Taskmaster/Tasks/task-4.md index 8cb77bd..32d60b7 100644 --- a/Taskmaster/Tasks/task-4.md +++ b/Taskmaster/Tasks/task-4.md @@ -1,23 +1,23 @@ --- taskmaster_id: "4" -status: "in-progress" +status: "done" priority: "high" depends_on: [] parent_id: "" source: "taskmaster" -generated_at: "2026-05-24T07:15:25.199Z" +generated_at: "2026-05-24T07:26:29.052Z" --- # 4 - Define backend security and refactor strategy from latest audit -- [ ] 4 - Define backend security and refactor strategy from latest audit #taskmaster #priority/high #status/in-progress ⏫ 🆔 tm-4 +- [x] 4 - Define backend security and refactor strategy from latest audit #taskmaster #priority/high #status/done ⏫ 🆔 tm-4 ## Metadata | Field | Value | | --- | --- | | Taskmaster ID | 4 | -| Status | in-progress | +| Status | done | | Priority | high | | Dependencies | None | | Parent | None |