suumo-search

Perform advanced searches on Suumo.jp
git clone https://git.neuralcrash.com/suumo-search.git
Log | Files | Refs | README

commit ac70b239e06be73985c506df4f8542840d4311ad
parent 21c7a15aed8412c27ea974e1168e45456b913cd2
Author: Kebigon <git@kebigon.xyz>
Date:   Fri, 13 Mar 2020 20:35:51 +0900

Work-around Suumo bot detection
Diffstat:
M src/main/java/xyz/kebigon/housesearch/browser/Browser.java | 16 ++++++++++++++--
M src/main/java/xyz/kebigon/housesearch/browser/suumo/SuumoBrowser.java | 4 ++++
2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/src/main/java/xyz/kebigon/housesearch/browser/Browser.java b/src/main/java/xyz/kebigon/housesearch/browser/Browser.java
@@ -15,7 +15,7 @@ import lombok.extern.slf4j.Slf4j;
 @Slf4j
 public abstract class Browser implements Closeable
 {
-    private final WebDriver driver;
+    private WebDriver driver;

     protected Browser()
     {
@@ -34,6 +34,16 @@ public abstract class Browser implements Closeable
         driver.navigate().to(url);
     }

+    protected void restartBrowser()
+    {
+        final String url = driver.getCurrentUrl();
+        log.info("Restarting browser, navigate to {}", url);
+
+        driver.quit();
+        driver = new HtmlUnitDriver();
+        driver.navigate().to(url);
+    }
+
     protected List<WebElement> findElements(String xpathExpression)
     {
         return driver.findElements(By.xpath(xpathExpression));
@@ -43,7 +53,9 @@ public abstract class Browser implements Closeable
     {
         try
         {
-            driver.findElement(By.xpath(xpathExpression)).click();
+            final WebElement element = driver.findElement(By.xpath(xpathExpression));
+            log.info("Click on {}, navigate to {}", element.getText(), element.getAttribute("href"));
+            element.click();
             return true;
         }
         catch (final NoSuchElementException e)
diff --git a/src/main/java/xyz/kebigon/housesearch/browser/suumo/SuumoBrowser.java b/src/main/java/xyz/kebigon/housesearch/browser/suumo/SuumoBrowser.java
@@ -24,6 +24,10 @@ public class SuumoBrowser extends Browser

         do
         {
+            // Suumo detected us as a bot, we need to restart the browser
+            while (!findElements("//div[@class='l-error']").isEmpty())
+                restartBrowser();
+
             postings.addAll(findElements("//div[@class='property_unit-content']").parallelStream() //
                     .map(SuumoBrowser::createPosting)//
                     .filter(sentPostings::notSent) //