import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

// --------------------------------------------------------------------

/**
 * Sends a search query to Google, collects the domain names of the result
 * links, and prints the HTML body of every linked page to standard output.
 *
 * <p>Run this as a standalone program with the command
 *
 * <pre>
 *    c:/polbio/sanparks/javarun GS
 * </pre>
 */
public class GS {

    /**
     * Regular expression for a host name: one or more dot-terminated labels
     * followed by a final label of 2 to 6 letters. Longer top-level domains
     * are therefore only matched up to their sixth letter.
     */
    private static final String DOMAIN_NAME_PATTERN
          = "([a-zA-Z0-9]([a-zA-Z0-9\\-]{0,61}[a-zA-Z0-9])?\\.)+[a-zA-Z]{2,6}";

    /** Compiled once; {@link Pattern} instances are immutable and shareable. */
    private static final Pattern PATTERN_DOMAIN_NAME = Pattern.compile(DOMAIN_NAME_PATTERN);

    /** User-agent header sent with every request (identifies as Googlebot). */
    private static final String USER_AGENT
          = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)";

    // --------------------------------------------------------------------

    /**
     * Runs a fixed sample query and prints each distinct domain found,
     * followed by the number of distinct domains.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        GS obj = new GS();
        Set<String> result = obj.getDataFromGoogle("Kruger rhino poaching");

        for (String temp : result) {
            System.out.println(temp);
        }
        System.out.println(result.size());
    }

    // --------------------------------------------------------------------

    /**
     * Extracts the first domain name found in the given text.
     *
     * @param url text to search, typically a URL or an href value; may be
     *            {@code null}
     * @return the first match of {@link #DOMAIN_NAME_PATTERN}, lower-cased,
     *         or the empty string when {@code url} is {@code null} or
     *         contains no match
     */
    public String getDomainName(String url) {
        if (url == null) {
            return "";
        }

        // A local matcher keeps this method free of shared mutable state.
        Matcher matcher = PATTERN_DOMAIN_NAME.matcher(url);
        if (!matcher.find()) {
            return "";
        }

        // Locale.ROOT avoids locale-specific case mappings (e.g. Turkish
        // dotless i) that would corrupt ASCII host names.
        return matcher.group(0).toLowerCase(Locale.ROOT).trim();
    }

    // --------------------------------------------------------------------

    /**
     * Queries Google web search, and for every result link (an href starting
     * with {@code /url?q=}) records its domain name and prints the linked
     * page's body HTML to standard output.
     *
     * <p>Author's notes carried over from earlier experiments: the plain
     * Google web search works but returns a poor news article search. The
     * aggregator endpoints that were tried --
     * {@code https://www.news.google.com/search?q=...&num=20},
     * {@code https://www.bing.com/news/search?q=...} (with and without
     * {@code &num=20}) and {@code https://www.newslookup.com/news/search?q=...}
     * -- refused the request: either they issued a 404 error or simply
     * returned garbage, not even null. Bing apparently has an API that should
     * be investigated, see
     * https://stackoverflow.com/questions/17250897/bing-api-sample-code .
     * Faroo
     * ({@code http://www.faroo.com/#q=kruger%20rhino%20poaching&s=1&l=en&src=web})
     * returned stories that are several years old; it does not seem useful to
     * pursue and did not work either.
     *
     * @param query free-text search terms; URL-encoded before being sent
     * @return the distinct, non-empty domain names found in the result links;
     *         empty if the search request failed
     */
    private Set<String> getDataFromGoogle(String query) {
        Set<String> result = new HashSet<String>();

        String request = "https://www.google.com/search?q=" + encodeQuery(query) + "&num=20";

        System.out.println("Sending request..." + request);

        try {
            Document doc = fetch(request, "https://www.google.com", 20 * 1000);

            Elements links = doc.select("a[href]");
            for (Element link : links) {
                String href = link.attr("href");
                if (!href.startsWith("/url?q=")) {
                    continue;
                }

                // Skip hrefs without a recognisable domain instead of
                // recording an empty string in the result set.
                String domain = getDomainName(href);
                if (!domain.isEmpty()) {
                    result.add(domain);
                }

                printStory(link.attr("abs:href"));
            }
        } catch (IOException e) {
            System.out.println("connect failed");
            e.printStackTrace();
        }

        return result;
    }

    // --------------------------------------------------------------------

    /**
     * Loads one story page and prints its link and body HTML. A failure to
     * load the page is reported and does not abort the caller's loop.
     *
     * <p>Author's note: on a typical query to Google, the loop calling this
     * generates an output file that is about 2 Mbytes. A bit large, but not
     * a big deal. Using {@code body().text()} gives a smaller file but there
     * are few dates left. Using {@code .html()} is about what the VB script
     * "innerhtml" command was doing, so this code will function in a similar
     * manner once a complete Java/Jsoup replacement for getstories.vbs is
     * completed based on this class.
     *
     * @param absHref absolute URL of the story, as resolved by Jsoup
     */
    private static void printStory(String absHref) {
        try {
            System.out.println("\nlink= " + absHref);

            Document story = fetch(absHref, "http://www.google.com", 5000);

            String alltext = story.body().html();
            System.out.println("text= " + alltext);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Performs an HTTP GET with the shared user agent.
     *
     * @param url           address to fetch
     * @param referrer      value of the referrer header
     * @param timeoutMillis request timeout in milliseconds
     * @return the parsed document
     * @throws IOException if the request fails
     */
    private static Document fetch(String url, String referrer, int timeoutMillis)
            throws IOException {
        return Jsoup.connect(url)
                .userAgent(USER_AGENT)
                .referrer(referrer)
                .timeout(timeoutMillis)
                .get();
    }

    /**
     * URL-encodes search terms so spaces and reserved characters such as
     * {@code &}, {@code #} and {@code +} cannot alter the request URL.
     *
     * @param query raw search terms; {@code null} is treated as empty
     * @return the terms in application/x-www-form-urlencoded form (UTF-8)
     */
    private static String encodeQuery(String query) {
        try {
            return URLEncoder.encode(query == null ? "" : query, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            // Every Java platform is required to support UTF-8.
            throw new IllegalStateException("UTF-8 encoding unavailable", e);
        }
    }
}
