Skip to content

Commit c33e78c

Browse files
committed
wp_html_split() match legacy shortcode-tags
1 parent bf6fe75 commit c33e78c

File tree

1 file changed

+35
-12
lines changed

1 file changed

+35
-12
lines changed

src/wp-includes/formatting.php

Lines changed: 35 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -668,23 +668,46 @@ public function extract_raw_token() {
668668
$next_at += strlen( $raw_token );
669669
$is_text = '#text' === $token_reporter->get_token_name();
670670

671-
if ( ! $is_text && ! $was_text ) {
672-
$tokens[] = '';
671+
if ( ! $is_text ) {
672+
if ( ! $was_text ) {
673+
$tokens[] = '';
674+
}
675+
676+
$tokens[] = $raw_token;
677+
$was_text = false;
678+
continue;
673679
}
674680

675681
/*
676-
* Some legacy code assumes that text nodes will never start with a
677-
* less-than sign (<) but this isn’t the case, as some text nodes do
678-
* if the less-than sign doesn’t introduce a syntax token. To avoid
679-
* further corruption a leading less-than sign is replaced by its
680-
* encoded equivalent numeric character reference.
682+
* WordPress looks for shortcodes and escaped shortcodes within the HTML
683+
* where they look like tags but HTML wouldn’t consider them tags, such
684+
* as in "<[header level=2]>". Look for these and artificially split the
685+
* text nodes where it looks like shortcodes reside inside.
681686
*/
682-
if ( $is_text && '<' === ( $raw_token[0] ?? '' ) ) {
683-
$raw_token = '&#60;' . substr( $raw_token, 1 );
684-
}
687+
$shortcode_pattern = get_shortcode_regex();
688+
$text_chunks = preg_split( "~(<{$shortcode_pattern}>)~", $raw_token, -1, PREG_SPLIT_DELIM_CAPTURE );
689+
foreach ( $text_chunks as $i => $token ) {
690+
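// With PREG_SPLIT_DELIM_CAPTURE, preg_split() puts captured delimiters at the odd indices, provided the pattern contributes exactly one capture group (TODO confirm for get_shortcode_regex()). NOTE(review): `===` binds tighter than `&` in PHP, so the check below needs parentheses, `0x01 === ( $i & 0x01 )`, to match every odd index rather than only index 1.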
691+
$is_shortcode_tag = 0x01 === $i & 0x01;
692+
693+
if ( $is_shortcode_tag && ! $was_text ) {
694+
$tokens[] = '';
695+
}
685696

686-
$tokens[] = $raw_token;
687-
$was_text = $is_text;
697+
/*
698+
* Some legacy code assumes that text nodes will never start with a
699+
* less-than sign (<) but this isn’t the case, as some text nodes do
700+
* if the less-than sign doesn’t introduce a syntax token. To avoid
701+
* further corruption a leading less-than sign is replaced by its
702+
* encoded equivalent numeric character reference.
703+
*/
704+
if ( ! $is_shortcode_tag && '<' === ( $token[0] ?? '' ) ) {
705+
$token = '&#60;' . substr( $token, 1 );
706+
}
707+
708+
$was_text = ! $is_shortcode_tag;
709+
$tokens[] = $token;
710+
}
688711
}
689712

690713
/*

0 commit comments

Comments
 (0)