// next.orly.dev/pkg/encoders/text/escape.go

package text

// NostrEscape appends the JSON-string-escaped form of src to dst and returns
// the extended slice. The escaping follows the serialization rules of NIP-01,
// which exist so that every implementation derives the same event ID from the
// same event:
//
//	No whitespace, line breaks or other unnecessary formatting should be included
//	in the output JSON. No characters except the following should be escaped, and
//	instead should be included verbatim:
//
//	- A line break, 0x0A, as \n
//	- A double quote, 0x22, as \"
//	- A backslash, 0x5C, as \\
//	- A carriage return, 0x0D, as \r
//	- A tab character, 0x09, as \t
//	- A backspace, 0x08, as \b
//	- A form feed, 0x0C, as \f
//
//	UTF-8 should be used for encoding.
//
// Two deliberate deviations from that list:
//
//   - Every remaining control byte below 0x20 is written as \u00XX (lowercase
//     hex). NIP-01 does not ask for this, but without it the output would not
//     be valid JSON when the content carries binary data.
//   - A backslash that is immediately followed by 'u' is written as a single
//     backslash, so text that already looks like a \uXXXX escape passes
//     through unchanged instead of being double-escaped.
//
// All other bytes, including multi-byte UTF-8 sequences, are copied verbatim.
// When dst is nil and src is non-empty a buffer is allocated up front; when
// both are empty the nil dst is returned as is.
func NostrEscape(dst, src []byte) []byte {
	const hexDigits = "0123456789abcdef"

	n := len(src)
	if dst == nil && n > 0 {
		// Most inputs need few escapes, so 1.5x the source length is a
		// reasonable starting capacity; append grows it if that is not enough.
		// Fall back to n should the multiplication ever wrap around.
		capHint := n * 3 / 2
		if capHint < n {
			capHint = n
		}
		dst = make([]byte, 0, capHint)
	}

	for i, c := range src {
		switch c {
		case '"':
			dst = append(dst, '\\', '"')
		case '\\':
			if i+1 < n && src[i+1] == 'u' {
				// Looks like an existing \u escape: keep the lone backslash and
				// let the following bytes be copied by later iterations.
				dst = append(dst, '\\')
				continue
			}
			dst = append(dst, '\\', '\\')
		case '\b':
			dst = append(dst, '\\', 'b')
		case '\t':
			dst = append(dst, '\\', 't')
		case '\n':
			dst = append(dst, '\\', 'n')
		case '\f':
			dst = append(dst, '\\', 'f')
		case '\r':
			dst = append(dst, '\\', 'r')
		default:
			if c >= 0x20 {
				dst = append(dst, c)
				continue
			}
			// Control byte without a short escape: emit \u00 plus two hex digits.
			dst = append(dst, '\\', 'u', '0', '0', hexDigits[c>>4], hexDigits[c&0x0f])
		}
	}
	return dst
}

// NostrUnescape reverses the operation of NostrEscape, but instead of
// appending to a separate slice it rewrites dst in place and returns the
// prefix of dst that holds the decoded bytes. This avoids a copy and costs
// zero allocations; the caller's original buffer is mangled by the call and
// the returned slice aliases it.
//
// Decoding rules for a backslash followed by a byte c:
//
//   - '"', '\\', 'b', 't', 'n', 'f', 'r' are the escapes listed by NIP-01 and
//     are decoded to the byte they stand for.
//   - "u00XY" where XY is valid hex with a value below 0x20 is decoded to that
//     single control byte (the form NostrEscape emits for control characters;
//     uppercase hex digits are accepted too).
//   - Any other "\u" sequence is kept as the two bytes `\u`, and the bytes that
//     follow are copied as ordinary data.
//   - Every other byte (including '/', digits and unknown escapes) is kept
//     together with its backslash, since those sequences must survive for ID
//     generation.
//
// A lone backslash at the very end of the input has nothing to escape and is
// kept as a literal backslash.
func NostrUnescape(dst []byte) (b []byte) {
	var r, w int
	for ; r < len(dst); r++ {
		c := dst[r]
		if c != '\\' {
			dst[w] = c
			w++
			continue
		}

		// Trailing backslash: there is no byte after it to inspect, so keep it
		// literally rather than indexing past the end of the slice.
		if r+1 >= len(dst) {
			dst[w] = '\\'
			w++
			break
		}

		r++
		c = dst[r]
		switch c {
		// nip-01 specifies the following single letter C-style escapes; these
		// are the only ones that can be safely decoded, because they are
		// re-encoded the same way when marshalled.
		case '"', '\\':
			dst[w] = c
			w++
		case 'b':
			dst[w] = '\b'
			w++
		case 't':
			dst[w] = '\t'
			w++
		case 'n':
			dst[w] = '\n'
			w++
		case 'f':
			dst[w] = '\f'
			w++
		case 'r':
			dst[w] = '\r'
			w++

		case 'u':
			// Decode only \u0000-\u001F. The high hex digit must therefore be
			// '0' or '1', and the low digit must be a real hex digit; anything
			// else is not a control-character escape and is preserved below.
			if r+4 < len(dst) && dst[r+1] == '0' && dst[r+2] == '0' {
				hi, lo := dst[r+3], dst[r+4]
				if hi == '0' || hi == '1' {
					var low byte
					valid := true
					switch {
					case lo >= '0' && lo <= '9':
						low = lo - '0'
					case lo >= 'a' && lo <= 'f':
						low = lo - 'a' + 10
					case lo >= 'A' && lo <= 'F':
						low = lo - 'A' + 10
					default:
						valid = false
					}
					if valid {
						dst[w] = (hi-'0')<<4 | low
						w++
						r += 4 // skip the "00XY" digits; the loop's r++ steps past them
						continue
					}
				}
			}
			// Not a control-character escape: keep `\u` and let the following
			// bytes be copied as ordinary data by later iterations.
			dst[w] = '\\'
			w++
			dst[w] = 'u'
			w++

		// '/', octal-looking digits and anything else after a reverse solidus
		// are not decoded; the backslash and the byte are both preserved.
		default:
			dst[w] = '\\'
			w++
			dst[w] = c
			w++
		}
	}
	return dst[:w]
}