@@ -668,23 +668,46 @@ public function extract_raw_token() {
668668 $ next_at += strlen ( $ raw_token );
669669 $ is_text = '#text ' === $ token_reporter ->get_token_name ();
670670
671- if ( ! $ is_text && ! $ was_text ) {
672- $ tokens [] = '' ;
671+ if ( ! $ is_text ) {
672+ if ( ! $ was_text ) {
673+ $ tokens [] = '' ;
674+ }
675+
676+ $ tokens [] = $ raw_token ;
677+ $ was_text = false ;
678+ continue ;
673679 }
674680
675681 /*
676- * Some legacy code assumes that text nodes will never start with a
677- * less-than sign (<) but this isn’t the case, as some text nodes do
678- * if the less-than sign doesn’t introduce a syntax token. To avoid
679- * further corruption a leading less-than sign is replaced by its
680- * encoded equivalent numeric character reference.
682+ * WordPress looks for shortcodes and escaped shortcodes within the HTML
683+ * where they look like tags but HTML wouldn’t consider them tags, such
684+ * as in "<[header level=2]>". Look for these and artificially split the
685+ * text nodes where it looks like shortcodes reside inside.
681686 */
682- if ( $ is_text && '< ' === ( $ raw_token [0 ] ?? '' ) ) {
683- $ raw_token = '< ' . substr ( $ raw_token , 1 );
684- }
687+ $ shortcode_pattern = get_shortcode_regex ();
688+ $ text_chunks = preg_split ( "~(< {$ shortcode_pattern }>)~ " , $ raw_token , -1 , PREG_SPLIT_DELIM_CAPTURE );
689+ foreach ( $ text_chunks as $ i => $ token ) {
690+ // The preg_split() always puts captured delimiters in the odd indices.
691+ $ is_shortcode_tag = 0x01 === $ i & 0x01 ;
692+
693+ if ( $ is_shortcode_tag && ! $ was_text ) {
694+ $ tokens [] = '' ;
695+ }
685696
686- $ tokens [] = $ raw_token ;
687- $ was_text = $ is_text ;
697+ /*
698+ * Some legacy code assumes that text nodes will never start with a
699+ * less-than sign (<) but this isn’t the case, as some text nodes do
700+ * if the less-than sign doesn’t introduce a syntax token. To avoid
701+ * further corruption a leading less-than sign is replaced by its
702+ * encoded equivalent numeric character reference.
703+ */
704+ if ( ! $ is_shortcode_tag && '< ' === ( $ token [0 ] ?? '' ) ) {
705+ $ token = '< ' . substr ( $ token , 1 );
706+ }
707+
708+ $ was_text = ! $ is_shortcode_tag ;
709+ $ tokens [] = $ token ;
710+ }
688711 }
689712
690713 /*
0 commit comments