How to process this string by regex in oracle plsql

Question

My string (varchar2): 1111 = 2222;3333 in 4444;5555 sum (6666;7777) I want to replace in my string ";" through " ; " except the ";" in parentheses.

Expected result:

1111 = 2222 ; 3333 in 4444 ; 5555 sum (6666;7777)

How to handle this case with the regex_replace function?

I want my regex to say: target them; and exclude them; in parentheses I know targeting a character is just indicating the character. The syntax for excluding a character is [^character_list] so for me is => [^;]

I tried this: regex_replace (myString, ';([^\(.;.\)]', ';');

But it doesn't work

link for a documentation : https://docs.oracle.com/cd/B12037_01/appdev.101/b10795/adfns_re.htm

Edit: sorry guys I made some mistake in my request. I corrected the description above. I only wish replace ";" trough with " ; " except ";" in parentesis. The regular expression for that in function regex_replace. I already have a function to split after. Sorry for that.

@MT0 That duplicate isn't specific enough. This problem has semicolons which are part of the data, not just the delimiters. — Tim Biegeleisen
– Tim Biegeleisen, Commented Nov 7, 2020 at 14:11
Honestly, you would best handle this problem outside your SQL database. Oracle wasn't built for such complex strings manipulations. And also, it would be better to try to receive and store your data in a normalized format, rather than semicolon-separated as you have now. — Tim Biegeleisen
– Tim Biegeleisen, Commented Nov 7, 2020 at 14:12
@TimBiegeleisen Its the same process, the OP just needs to fix their regular expression so they are not trying to use a non-matching character list [^] to match the braces. — MT0
– MT0, Commented Nov 7, 2020 at 14:34

MT0 · Accepted Answer · 2020-11-07 22:31:57Z

You can use a recursive sub-query factoring clause without any regular expressions and which correctly handles nested braces (such as (abc; (def;fff); zzyzx)):

WITH char_groups (
  id, value, idx, len, grp, grp_start, is_end, brace_level
) AS (
  SELECT id,
         value,
         DECODE( SUBSTR( value, 1, 1 ), ';', 0, 1 ),
         LENGTH( value ),
         1,
         DECODE( SUBSTR( value, 1, 1 ), ';', 0, 1 ),
         CASE
         WHEN ';' IN ( SUBSTR( value, 1, 1 ), SUBSTR( value, 2, 1 ) )
         OR   value IS NULL
         THEN 1
         ELSE 0
         END,
         DECODE( SUBSTR( value, 1, 1 ), '(', 1, 0 )
  FROM   table_name
UNION ALL
  SELECT id,
         value,
         idx + 1,
         len,
         CASE
         WHEN brace_level = 0 AND SUBSTR( value, idx + 1, 1 ) = ';'
         THEN grp + 1
         ELSE grp
         END,
         CASE
         WHEN brace_level = 0 AND SUBSTR( value, idx + 1, 1 ) = ';'
         THEN idx + 1
         ELSE grp_start
         END,
         CASE
         WHEN (
                (
                  ( brace_level = 0 AND SUBSTR( value, idx + 1, 1 ) <> '(' )
                  OR
                  ( brace_level = 1 AND SUBSTR( value, idx + 1, 1 ) = ')' )
                )
                AND
                SUBSTR( value, idx + 2, 1 ) = ';'
              )
         OR   idx + 1 = len
         THEN 1
         ELSE 0
         END,
         GREATEST(
           brace_level + DECODE( SUBSTR( value, idx + 1, 1 ), '(', 1, ')', -1, 0 ),
           0
         )
  FROM   char_groups
  WHERE  idx < len
)
SELECT id,
       grp,
       LTRIM( SUBSTR( value, grp_start, idx - grp_start + 1 ), ';' ) AS term
FROM   char_groups cg
WHERE  is_end = 1
ORDER BY id, grp

Which, for the sample data:

CREATE TABLE table_name ( id, value ) AS
SELECT 1, '1111 = 2222; 3333 in 4444; 5555 sum (6666; 7777)' FROM DUAL UNION ALL
SELECT 2, 'ABCD'                    FROM DUAL UNION ALL
SELECT 3, 'A;;C;D;;F(F;F;F);'       FROM DUAL UNION ALL
SELECT 4, NULL                      FROM DUAL UNION ALL
SELECT 5, 'abc; (e;f) = ( e; f)'    FROM DUAL UNION ALL
SELECT 6, '(abc; (def;fff); zzyzx)' FROM DUAL UNION ALL
SELECT 7, '(((abc)));(def));((ghi;jkl);mno);pqr' FROM DUAL UNION ALL
SELECT 8, ';'                       FROM DUAL UNION ALL
SELECT 9, ';A;'                     FROM DUAL UNION ALL
SELECT 10, '()(;);(;)(;)'           FROM DUAL;

Outputs (including NULL values for when you have two adjacent ; delimiters):

ID | GRP | TERM                   
-: | --: | :----------------------
 1 |   1 | 1111 = 2222            
 1 |   2 |  3333 in 4444          
 1 |   3 |  5555 sum (6666; 7777) 
 2 |   1 | ABCD                   
 3 |   1 | A                      
 3 |   2 | null                   
 3 |   3 | C                      
 3 |   4 | D                      
 3 |   5 | null                   
 3 |   6 | F(F;F;F)               
 3 |   7 | null                   
 4 |   1 | null                   
 5 |   1 | abc                    
 5 |   2 |  (e;f) = ( e; f)       
 6 |   1 | (abc; (def;fff); zzyzx)
 7 |   1 | (((abc)))              
 7 |   2 | (def))                 
 7 |   3 | ((ghi;jkl);mno)        
 7 |   4 | pqr                    
 8 |   1 | null                   
 8 |   2 | null                   
 9 |   1 | null                   
 9 |   2 | A                      
 9 |   3 | null                   
10 |   1 | ()(;)                  
10 |   2 | (;)(;)

db<>fiddle here

Previous Version:

Will work if you have single, balanced parentheses:

WITH terms ( id, value, idx, term, spos, epos ) AS (
  SELECT id,
         value,
         1,
         REGEXP_SUBSTR( value, '(([^;(]|\(.*?\))*)(;|$)', 1, 1, NULL, 1 ),
         1,
         REGEXP_INSTR( value, '(([^;(]|\(.*?\))*)(;|$)', 1, 1, 1 )
  FROM   table_name
UNION ALL
  SELECT id,
         value,
         idx + 1,
         REGEXP_SUBSTR( value, '(([^;(]|\(.*?\))*)(;|$)', epos, 1, NULL, 1 ),
         epos,
         REGEXP_INSTR( value, '(([^;(]|\(.*?\))*)(;|$)', epos, 1, 1 )
  FROM   terms
  WHERE  epos > 0
)
SELECT id, idx, term
FROM   terms
WHERE  SUBSTR( value, -1 ) = ';'
OR     spos < LENGTH( value )
OR     value IS NULL
ORDER BY id, idx;

Which outputs:

ID | IDX | TERM                  
-: | --: | :---------------------
 1 |   1 | 1111 = 2222           
 1 |   2 |  3333 in 4444         
 1 |   3 |  5555 sum (6666; 7777)
 2 |   1 | ABCD                  
 3 |   1 | A                     
 3 |   2 | null                  
 3 |   3 | C                     
 3 |   4 | D                     
 3 |   5 | null                  
 3 |   6 | F(F;F;F)              
 3 |   7 | null                  
 4 |   1 | null                  
 5 |   1 | abc                   
 5 |   2 |  (e;f) = ( e; f)      
 6 |   1 | (abc; (def;fff)       
 6 |   2 |  zzyzx)               
 7 |   1 | (((abc)))             
 7 |   2 | (def))                
 7 |   3 | ((ghi;jkl)            
 7 |   4 | mno)                  
 7 |   5 | pqr                   
 8 |   1 | null                  
 8 |   2 | null                  
 9 |   1 | null                  
 9 |   2 | A                     
 9 |   3 | null                  
10 |   1 | ()(;)                 
10 |   2 | (;)(;)

(Note: it doesn't work for examples 6 & 7 where there are multiple nested parentheses but does work or the simpler test cases.)

db<>fiddle here

Update

I only wish replace ";" trough with " ; " except ";" in parenthesis.

You can use:

WITH char_groups (
  id, value, idx, len, grp, brace_level
) AS (
  SELECT id,
         value,
         CASE SUBSTR( value, 1, 1 ) WHEN ';' THEN 0 ELSE 1 END,
         LENGTH( value ),
         1,
         CASE SUBSTR( value, 1, 1 ) WHEN '(' THEN 1 ELSE 0 END
  FROM   table_name
UNION ALL
  SELECT id,
         CASE
         WHEN brace_level = 0 AND SUBSTR( value, idx + 1, 1 ) = ';'
         THEN SUBSTR( value, 1, idx ) || ' ; ' || SUBSTR( value, idx + 2 )
         ELSE value
         END,
         CASE
         WHEN brace_level = 0 AND SUBSTR( value, idx + 1, 1 ) = ';'
         THEN idx + 3
         ELSE idx + 1
         END,
         CASE
         WHEN brace_level = 0 AND SUBSTR( value, idx + 1, 1 ) = ';'
         THEN len + 2
         ELSE len
         END,
         CASE
         WHEN brace_level = 0 AND SUBSTR( value, idx + 1, 1 ) = ';'
         THEN grp + 1
         ELSE grp
         END,
         GREATEST(
           brace_level
           + CASE SUBSTR( value, idx + 1, 1 )
             WHEN '(' THEN 1
             WHEN ')' THEN -1
             ELSE 0
             END,
           0
         )
  FROM   char_groups
  WHERE  idx < len
)
SELECT id,
       value
FROM   char_groups
WHERE  idx = len OR value IS NULL
ORDER BY id, idx

And, for the sample data:

CREATE TABLE table_name ( id, value ) AS
SELECT 1, '1111 = 2222; 3333 in 4444; 5555 sum (6666; 7777)' FROM DUAL UNION ALL
SELECT 2, 'ABCD'                    FROM DUAL UNION ALL
SELECT 3, 'A;;C;D;;F(F;F;F);'       FROM DUAL UNION ALL
SELECT 4, NULL                      FROM DUAL UNION ALL
SELECT 5, 'abc; (e;f) = ( e; f)'    FROM DUAL UNION ALL
SELECT 6, '(abc; (def;fff); zzyzx)' FROM DUAL UNION ALL
SELECT 7, '(((abc)));(def));((ghi;jkl);mno);pqr' FROM DUAL UNION ALL
SELECT 8, ';'                       FROM DUAL UNION ALL
SELECT 9, ';A;'                     FROM DUAL UNION ALL
SELECT 10, '()(;);(;)(;)'           FROM DUAL UNION ALL
SELECT 11, '(a ; b);(c ; d)'        FROM DUAL;

This outputs:

ID | VALUE                                               
-: | :---------------------------------------------------
 1 | 1111 = 2222 ;  3333 in 4444 ;  5555 sum (6666; 7777)
 2 | ABCD                                                
 3 | A ;  ; C ; D ;  ; F(F;F;F) ;                        
 4 | null                                                
 5 | abc ;  (e;f) = ( e; f)                              
 6 | (abc; (def;fff); zzyzx)                             
 7 | (((abc))) ; (def)) ; ((ghi;jkl);mno) ; pqr          
 8 |  ;                                                  
 9 |  ; A ;                                              
10 | ()(;) ; (;)(;)                                      
11 | (a ; b) ; (c ; d)

Or, for only single nested braces, you can use:

SELECT id,
       RTRIM( REGEXP_REPLACE( value, '(([^;(]|\(.*?\))*)(;|$)', '\1 \3 ' ) )
       || CASE SUBSTR( value, -1 ) WHEN ';' THEN ' ' END AS value
FROM   table_name

Which outputs:

ID | VALUE                                               
-: | :---------------------------------------------------
 1 | 1111 = 2222 ;  3333 in 4444 ;  5555 sum (6666; 7777)
 2 | ABCD                                                
 3 | A ;  ; C ; D ;  ; F(F;F;F) ;                        
 4 | null                                                
 5 | abc ;  (e;f) = ( e; f)                              
 6 | (abc; (def;fff) ;  zzyzx)                           
 7 | (((abc))) ; (def)) ; ((ghi;jkl) ; mno) ; pqr        
 8 |  ;                                                  
 9 |  ; A ;                                              
10 | ()(;) ; (;)(;)                                      
11 | (a ; b) ; (c ; d)

db<>fiddle here

I already have a function to split after.

Please check your split function against case #11.

Oh - and I just figured out how our approaches are different. In finding tokens, I look first for parenthesized substring, and only then for non-semicolon characters. This allows me to not care what's in the parenthesized substring (I don't need to single out semicolon in parentheses). I believe our solutions may give different answers if there are unbalanced parentheses in the input - in which case, perhaps, the output is irrelevant as the input is corrupted anyway.
@mathguy Was going to fix my original (regular expression based) solution and then went for a completely different solution (effectively a tokenizer, without regular expressions) that solves the problem of nested braces.
Thank you. This solution is awesome but I made some mistake in my request and I don't need all this. Can you help me with my new request edit ?

score 0 · Accepted Answer · 2020-11-07 17:51:04Z

Here is one way to do this. The idea is simple: for each token, collect either an opening parenthesis, then any number of non-closing-parenthesis characters, then a closing parenthesis, OR a single non-semicolon. Collect ANY NUMBER of consecutive occurrences of the fragments I just described, followed either by semicolon or by the end of the string. That's a "token".

Note that this will not work correctly if you may have nested parentheses. To illustrate that, I included an example in my sample data. I also included a row where the "value" is NULL from the outset; presumably that should still produce a row in the output, with NULL token.

If my testing is correct, this will produce the same answer as MT0's, except when the input string is NULL (his solution won't produce any output for such inputs).

Sample data

create table sample_data (id, value) as
  select 1, '1111 = 2222; 3333 in 4444; 5555 sum (6666; 7777)'
                                      from dual union all
  select 2, 'ABCD'                    from dual union all
  select 3, null                      from dual union all
  select 4, 'A;;C;D;;F(F;F;F);'       from dual union all
  select 5, 'abc; (e;f) = ( e; f)'    from dual union all
  select 6, '(abc; (def;fff); zzyzx)' from dual
;

Query and output

select i.id, t.ord, t.token
from   sample_data i
       cross apply
       (
         select  level as ord,
                 regexp_substr(i.value, '((\([^)]*\)|[^;])*)(;|$)', 1,
                               level, null, 1) as token
         from    dual
         connect by level <= regexp_count(i.value, '((\([^)]*?\)|[^;])*)(;|$)')
                             - case when i.value like '%;' then 0 else 1 end
       ) t
;

ID ORD TOKEN                 
-- --- ----------------------
 1   1 1111 = 2222           
 1   2  3333 in 4444         
 1   3  5555 sum (6666; 7777)
 2   1 ABCD                  
 3   1                       
 4   1 A                     
 4   2                       
 4   3 C                     
 4   4 D                     
 4   5                       
 4   6 F(F;F;F)              
 4   7                       
 5   1 abc                   
 5   2  (e;f) = ( e; f)      
 6   1 (abc; (def;fff)     -- NOT the required result, when there are     
 6   2  zzyzx)             -- nested parentheses in the input!

Collectives™ on Stack Overflow

How to process this string by regex in oracle plsql

2 Answers 2

Update

3 Comments

Comments

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

2 Answers 2

Update

3 Comments

Comments

Your Answer

Sign up or log in

Post as a guest

Related