I would love to scrape the titles of the top 250 movies (https://www.imdb.com/chart/top/) for educational purposes.
I have tried a lot of things but I messed up at the end every time. Could you please help me scrape the titles with Java and regex?
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Downloads the IMDb "Top 250" chart page and prints every movie title found in it,
 * one title per line.
 *
 * <p>Titles are located with a regular expression that looks for table cells carrying
 * {@code class="titleColumn"} and takes the text of the first link inside each cell.
 * NOTE(review): this relies on the page still using that markup; if the site layout
 * changes, {@link #TITLE_PATTERN} must be adjusted.
 */
public class scraping {

    /** Address of the chart page to scrape. */
    private static final String CHART_URL = "https://www.imdb.com/chart/top/";

    /**
     * Matches a {@code titleColumn} cell up to the end of its first link; group 1 is the
     * link text (the title). DOTALL lets {@code .} cross line breaks, because the cell
     * and its link are not guaranteed to sit on the same line.
     */
    private static final Pattern TITLE_PATTERN = Pattern.compile(
            "<td\\s+class=\"titleColumn\"[^>]*>.*?<a\\b[^>]*>(.*?)</a>",
            Pattern.DOTALL);

    /**
     * Program entry point: fetches the chart page and prints the extracted titles.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String args[]) {
        try {
            for (String title : extractTitles(fetch(CHART_URL))) {
                System.out.println(title);
            }
        } catch (IOException e) {
            System.err.println("Exception: " + e.getClass() + ", Details: " + e.getMessage());
        }
    }

    /**
     * Extracts the movie titles from the chart page markup.
     *
     * @param html the full page source; {@code null} is treated as an empty document
     * @return the titles in document order, trimmed of surrounding whitespace; empty
     *         when nothing matches
     */
    static List<String> extractTitles(String html) {
        List<String> titles = new ArrayList<>();
        if (html == null) {
            return titles;
        }
        Matcher matcher = TITLE_PATTERN.matcher(html);
        while (matcher.find()) {
            titles.add(matcher.group(1).trim());
        }
        return titles;
    }

    /**
     * Reads the complete response body of the given address as UTF-8 text.
     *
     * @param address the URL to download
     * @return the whole body, with each line terminated by {@code '\n'}
     * @throws IOException if the address is malformed or the transfer fails
     */
    private static String fetch(String address) throws IOException {
        URLConnection connection = new URL(address).openConnection();
        // NOTE(review): some servers reject clients that do not send a browser-like
        // User-Agent; confirm whether this site requires it.
        connection.setRequestProperty("User-Agent", "Mozilla/5.0");

        StringBuilder body = new StringBuilder();
        // try-with-resources closes the reader (and the underlying stream) on every path.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
            String line;
            // Read until end of stream so the whole document is searched, not just line one.
            while ((line = reader.readLine()) != null) {
                body.append(line).append('\n');
            }
        }
        return body.toString();
    }
}
Thank you for your time.