2

I have written a program to search for a given phrase in a paragraph and enclose the phrase with a curly braces in that paragraph. I have used BoyerMoore's Algorithm for searching purpose.In the same time i also need to enhance the performance of the program. Though i got the required output the performance is disastrous.

Here is the code:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;

public class BoyerMoore {
    static class Pair {
        public int start, end;

        Pair(int start, int end) {
            this.start = start;
            this.end = end;
        }

        public int weight() {
            return end - start;
        }

        public boolean contains(int point) {
            return start <= point && point <= end;
        }

        public int returnStart() {
            return start;
        }


    }

    static class Group {
        public List<Pair> pairs = new ArrayList<Pair>();
        public Pair maxWeight;

        Group(Pair start) {
            add(start);
        }

        Group(List<Pair> pairs) {
            for (Pair pair : pairs) {
                add(pair);
            }
        }

        public boolean contains(Pair pair) {
            for (Pair my : pairs) {
                if (my.contains(pair.start) || my.contains(pair.end))
                    return true;
            }
            return false;
        }

        public void add(Pair pair) {
            pairs.add(pair);
            if (maxWeight == null || maxWeight.weight() < pair.weight())
                maxWeight = pair;
        }
    }

    public static List<Integer> match(String pattern, String text) {
        List<Integer> matches = new ArrayList<Integer>();
        int m = text.length();
        int n = pattern.length();

        Map<Character, Integer> rightMostIndexes = preprocessForBadCharacterShift(pattern);
        int alignedAt = 0;
        while (alignedAt + (n - 1) < m) {
            for (int indexInPattern = n - 1; indexInPattern >= 0; indexInPattern--) {
                int indexInText = alignedAt + indexInPattern;
                char x = text.charAt(indexInText);
                char y = pattern.charAt(indexInPattern);
                if (indexInText >= m)
                    break;
                if (x != y) {
                    Integer r = rightMostIndexes.get(x);
                    if (r == null) {
                        alignedAt = indexInText + 1;
                    } else {
                        int shift = indexInText - (alignedAt + r);
                        alignedAt += shift > 0 ? shift : 1;
                    }
                    break;
                } else if (indexInPattern == 0) {
                    matches.add(alignedAt);
                    alignedAt++;
                }
            }
        }
        return matches;
    }

    private static Map<Character, Integer> preprocessForBadCharacterShift(
            String pattern) {
        Map<Character, Integer> map = new HashMap<Character, Integer>();
        for (int i = pattern.length() - 1; i >= 0; i--) {
            char c = pattern.charAt(i);
            if (!map.containsKey(c))
                map.put(c, i);
        }
        return map;
    }

    public static void main(String[] args) throws IOException {

        BufferedReader input = new BufferedReader(new InputStreamReader(
                System.in));

        ArrayList<String> ListOfAllPhrase = new ArrayList<String>();

        List<Pair> pairs = new ArrayList<Pair>();

        List<Group> groups = new ArrayList<Group>();
        ListOfAllPhrase.add("protein");
        ListOfAllPhrase.add("protein kinase");
        ListOfAllPhrase.add("protein kinase A anchor protein");
        ListOfAllPhrase.add("protein kinase A anchor proteins");
        ListOfAllPhrase.add("protein kinase A anchor protein activity");

        ListOfAllPhrase.add("IL-6");

        ListOfAllPhrase.add("SOX5");
        ListOfAllPhrase.add("NOX5");    

        System.out.println("Input a sentence: ");
        String line = input.readLine();
        char[] lineInChar = line.toCharArray();
        long startTime = System.currentTimeMillis();

        for (int i = 0; i < ListOfAllPhrase.size(); i++) {

            // offset.add((ListOfAllPhrase.get(i)).length());

            List<Integer> matches = match(ListOfAllPhrase.get(i).toLowerCase(),
                    line.toLowerCase());
            for (Integer integer : matches) {

                pairs.add(new Pair(integer, (ListOfAllPhrase.get(i)).length()
                        + integer));


            }

        }


        System.out.println("Total time taken: "
                + (System.currentTimeMillis() - startTime));

        for (Pair pair : pairs) {
            List<Group> intersects = new ArrayList<Group>();
            for (Group group : groups) {
                if (group.contains(pair)) {
                    intersects.add(group);
                }
            }

            if (intersects.isEmpty()) {
                groups.add(new Group(pair));
            } else {
                List<Pair> intervals = new ArrayList<Pair>();
                intervals.add(pair);
                for (Group intersect : intersects) {
                    intervals.addAll(intersect.pairs);
                }

                groups.removeAll(intersects);
                groups.add(new Group(intervals));
            }
        }
        StringBuilder newBuilder = new StringBuilder();
        int flag = 1;
        System.out.println(lineInChar.length);
        for (int a = 0; a <= lineInChar.length; a++) {

            for (Group group : groups) {

                if (a == group.maxWeight.start) {
                    newBuilder.append("{");
                    flag = 1;

                    break;
                }
                if (a == group.maxWeight.end && a == lineInChar.length) {
                    newBuilder.append("}");
                    flag = 0;
                    break;
                }
                if (a == lineInChar.length && a == group.maxWeight.end + 1) {
                    newBuilder.append("}");
                    flag = 0;
                    break;
                }

                if (a == group.maxWeight.end) {
                    newBuilder.append("}");
                    flag = 1;

                    break;
                }
            }
            if (flag == 0)
                continue;

            newBuilder.append(lineInChar[a]);
            flag = 1;

        }
        System.out.println("Final output: " + newBuilder);


    }
}

What can I implement or do to increase the performance of my program? Should i switch to another string search algorithm?

If anyone could help me with this?

1
  • How can anyone suggest without code about performance ? Commented Nov 17, 2012 at 8:32

1 Answer 1

1

I think you implemented the Boyer-Moore algorithm like it is described. Although I would suggest this:

  • Avoid 'Expensive' operations in a for loop. For instance the toLowerCase() operations in your main method. Rewrite the loop (33% speed gain in my test):

    for (int i = 0; i < ListOfAllPhrase.size(); i++) {
    
        // offset.add((ListOfAllPhrase.get(i)).length());
    
        List<Integer> matches = match(ListOfAllPhrase.get(i).toLowerCase(),
                line.toLowerCase());
        for (Integer integer : matches) {
    
            pairs.add(new Pair(integer, (ListOfAllPhrase.get(i)).length()
                    + integer));
        }
    }
    

    To :

    ArrayList<String> lowerCaseListOfPhrases = new ArrayList<String>(ListOfAllPhrase.size());
    for (String phrase : ListOfAllPhrase) {
        lowerCaseListOfPhrases.add(phrase.toLowerCase());
    }
    String lowerCaseLine = line.toLowerCase();
    for (String phrase : lowerCaseListOfPhrases) {
      List<Integer> matches = match(phrase, lowerCaseLine);
        for (Integer integer : matches) {
            pairs.add(new Pair(integer, phrase.length() + integer));
        }
    
    }
    
  • Take a look at this fast implementation (See http://algs4.cs.princeton.edu/53substring/BoyerMoore.java.html):

    public static List<Integer> match2(String pattern, String text) {
      List<Integer> result = new ArrayList<Integer>();
    
      int[] right = new int[256]; // Assuming a 256 character encoding
      for (int c = 0; c < 256; c++)
            right[c] = -1;
      for (int j = 0; j < pattern.length(); j++)
            right[pattern.charAt(j)] = j;
    
      int M = pattern.length();
      int N = text.length();
      int skip;
      for (int i = 0; i <= N - M; i += skip) {
            skip = 0;
            for (int j = M-1; j >= 0; j--) {
                  if (pattern.charAt(j) != text.charAt(i+j)) {
                        skip = Math.max(1, j - right[text.charAt(i+j)]);
                        break;
                  }
            }
            if (skip == 0) {    // found
              result.add(i);
              skip += pattern.length();
            }
      }
      return result;
    }
    

    I get a performance increase of +- 50% when executing this test:

    public static void main(String[] args) throws IOException {
    
      String phrase = "protein kinase A anchor protein activity";
      String txt = "This is a test protein kinase A anchor protein activityThis is a test protein kinase A anchor protein activityThis is ";
    
      List<Integer> result1 = null;
      List<Integer> result2 = null;
    
      long currentTime = System.currentTimeMillis();
      for (int i=0; i<1000000; i++) {
            result1 = match(phrase, txt);
      }
      System.out.println("ExecutionTime match: " + (System.currentTimeMillis() - currentTime));
    
      currentTime = System.currentTimeMillis();
      for (int i=0; i<1000000; i++) {
            result2 = match2(phrase, txt);
      }
      System.out.println("ExecutionTime match2: " + (System.currentTimeMillis() - currentTime));
    
      Assert.assertTrue(result1.equals(result2));
    
    }
    

    Output:

    ExecutionTime match: 5590
    ExecutionTime match2: 2663

    • If you do not mind about Boyer-Moore algorithm, please use Java built-in functionality:

    public static List match3(String pattern, String text) { List result = new ArrayList();

    int index = text.indexOf(pattern);
    while (index >= 0) {
        result.add(index);
        index = text.indexOf(pattern, index + 1);
    }
    return result;
    }
    
Sign up to request clarification or add additional context in comments.

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.