From 4e3619ad134fa36ce34c053be00bcd63019f81e2 Mon Sep 17 00:00:00 2001 From: MetaPrime Date: Thu, 2 Jan 2025 00:00:25 -0800 Subject: [PATCH 1/6] Fix usage of deprecated URL constructors --- .../ripme/ripper/AbstractHTMLRipper.java | 55 +++++++++++-------- .../ripme/ripper/AbstractJSONRipper.java | 3 +- .../ripme/ripper/rippers/ChanRipper.java | 5 +- .../rippers/video/MotherlessVideoRipper.java | 8 ++- 4 files changed, 43 insertions(+), 28 deletions(-) diff --git a/src/main/java/com/rarchives/ripme/ripper/AbstractHTMLRipper.java b/src/main/java/com/rarchives/ripme/ripper/AbstractHTMLRipper.java index e7b646e5a..3733fb153 100644 --- a/src/main/java/com/rarchives/ripme/ripper/AbstractHTMLRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/AbstractHTMLRipper.java @@ -28,7 +28,7 @@ * Simplified ripper, designed for ripping from sites by parsing HTML. */ public abstract class AbstractHTMLRipper extends AbstractRipper { - + private final Map itemsPending = Collections.synchronizedMap(new HashMap<>()); private final Map itemsCompleted = Collections.synchronizedMap(new HashMap<>()); private final Map itemsErrored = Collections.synchronizedMap(new HashMap<>()); @@ -60,11 +60,15 @@ protected Document getCachedFirstPage() throws IOException, URISyntaxException { public Document getNextPage(Document doc) throws IOException, URISyntaxException { return null; } - protected abstract List getURLsFromPage(Document page) throws UnsupportedEncodingException; + + protected abstract List getURLsFromPage(Document page) throws UnsupportedEncodingException, URISyntaxException; + protected List getDescriptionsFromPage(Document doc) throws IOException { throw new IOException("getDescriptionsFromPage not implemented"); // Do I do this or make an abstract function? } + protected abstract void downloadURL(URL url, int index); + protected DownloadThreadPool getThreadPool() { return null; } @@ -130,7 +134,7 @@ public void rip() throws IOException, URISyntaxException { List doclocation = new ArrayList<>(); LOGGER.info("Got doc location " + doc.location()); - + while (doc != null) { LOGGER.info("Processing a doc..."); @@ -167,7 +171,7 @@ public void rip() throws IOException, URISyntaxException { for (String imageURL : imageURLs) { index += 1; LOGGER.debug("Found image url #" + index + ": '" + imageURL + "'"); - downloadURL(new URL(imageURL), index); + downloadURL(new URI(imageURL).toURL(), index); if (isStopped() || isThisATest()) { break; } @@ -182,19 +186,26 @@ public void rip() throws IOException, URISyntaxException { if (isStopped() || isThisATest()) { break; } + textindex += 1; LOGGER.debug("Getting description from " + textURL); String[] tempDesc = getDescription(textURL,doc); + if (tempDesc != null) { - if (Utils.getConfigBoolean("file.overwrite", false) || !(new File( - workingDir.getCanonicalPath() - + "" - + File.separator - + getPrefix(index) - + (tempDesc.length > 1 ? tempDesc[1] : fileNameFromURL(new URL(textURL))) - + ".txt").exists())) { + URL url = new URI(textURL).toURL(); + String filename = fileNameFromURL(url); + + boolean fileExists = new File( + workingDir.getCanonicalPath() + + "" + + File.separator + + getPrefix(index) + + (tempDesc.length > 1 ? tempDesc[1] : filename) + + ".txt").exists(); + + if (Utils.getConfigBoolean("file.overwrite", false) || !fileExists) { LOGGER.debug("Got description from " + textURL); - saveText(new URL(textURL), "", tempDesc[0], textindex, (tempDesc.length > 1 ? 
tempDesc[1] : fileNameFromURL(new URL(textURL)))); + saveText(url, "", tempDesc[0], textindex, (tempDesc.length > 1 ? tempDesc[1] : filename)); sleep(descSleepTime()); } else { LOGGER.debug("Description from " + textURL + " already exists."); @@ -225,12 +236,12 @@ public void rip() throws IOException, URISyntaxException { } waitForThreads(); } - + /** * Gets the file name from the URL - * @param url + * @param url * URL that you want to get the filename from - * @return + * @return * Filename of the URL */ private String fileNameFromURL(URL url) { @@ -244,7 +255,7 @@ private String fileNameFromURL(URL url) { return saveAs; } /** - * + * * @param url * Target URL * @param subdirectory @@ -253,7 +264,7 @@ private String fileNameFromURL(URL url) { * Text you want to save * @param index * Index in something like an album - * @return + * @return * True if ripped successfully * False if failed */ @@ -295,12 +306,12 @@ private boolean saveText(URL url, String subdirectory, String text, int index, S } return true; } - + /** * Gets prefix based on where in the index it is - * @param index + * @param index * The index in question - * @return + * @return * Returns prefix for a file. (?) */ protected String getPrefix(int index) { @@ -313,9 +324,9 @@ protected String getPrefix(int index) { /* * ------ Methods copied from AlbumRipper. ------ - * This removes AlbumnRipper's usage from this class. + * This removes AlbumnRipper's usage from this class. */ - + protected boolean allowDuplicates() { return false; } diff --git a/src/main/java/com/rarchives/ripme/ripper/AbstractJSONRipper.java b/src/main/java/com/rarchives/ripme/ripper/AbstractJSONRipper.java index 1d8e688a0..8b00cec37 100644 --- a/src/main/java/com/rarchives/ripme/ripper/AbstractJSONRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/AbstractJSONRipper.java @@ -8,6 +8,7 @@ import java.io.File; import java.io.IOException; import java.net.MalformedURLException; +import java.net.URI; import java.net.URISyntaxException; import java.net.URL; import java.nio.charset.StandardCharsets; @@ -94,7 +95,7 @@ public void rip() throws IOException, URISyntaxException { index += 1; LOGGER.debug("Found image url #" + index+ ": " + imageURL); - downloadURL(new URL(imageURL), index); + downloadURL(new URI(imageURL).toURL(), index); } if (isStopped() || isThisATest()) { diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/ChanRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/ChanRipper.java index f1d41426a..7551d198e 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/ChanRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/ChanRipper.java @@ -6,6 +6,7 @@ import com.rarchives.ripme.utils.RipUtils; import java.io.IOException; import java.net.MalformedURLException; +import java.net.URI; import java.net.URISyntaxException; import java.net.URL; import java.util.ArrayList; @@ -208,7 +209,7 @@ private boolean isURLBlacklisted(String url) { return false; } @Override - public List getURLsFromPage(Document page) { + public List getURLsFromPage(Document page) throws URISyntaxException { List imageURLs = new ArrayList<>(); Pattern p; Matcher m; for (Element link : page.select("a")) { @@ -254,7 +255,7 @@ public List getURLsFromPage(Document page) { //Copied code from RedditRipper, getFilesFromURL should also implement stuff like flickr albums URL originalURL; try { - originalURL = new URL(href); + originalURL = new URI(href).toURL(); } catch (MalformedURLException e) { continue; } diff --git 
a/src/main/java/com/rarchives/ripme/ripper/rippers/video/MotherlessVideoRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/video/MotherlessVideoRipper.java index 6af8840ba..035ab73ef 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/video/MotherlessVideoRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/video/MotherlessVideoRipper.java @@ -2,6 +2,8 @@ import java.io.IOException; import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; import java.util.List; import java.util.regex.Matcher; @@ -51,7 +53,7 @@ public String getGID(URL url) throws MalformedURLException { } @Override - public void rip() throws IOException { + public void rip() throws IOException, URISyntaxException { LOGGER.info(" Retrieving " + this.url); String html = Http.url(this.url).get().toString(); if (html.contains("__fileurl = '")) { @@ -62,7 +64,7 @@ public void rip() throws IOException { throw new IOException("Could not find video URL at " + url); } String vidUrl = vidUrls.get(0); - addURLToDownload(new URL(vidUrl), HOST + "_" + getGID(this.url)); + addURLToDownload(new URI(vidUrl).toURL(), HOST + "_" + getGID(this.url)); waitForThreads(); } -} \ No newline at end of file +} From 42efc815df0ddcd38d272e6b1a94c340a2a4f6a7 Mon Sep 17 00:00:00 2001 From: MetaPrime Date: Thu, 2 Jan 2025 01:17:31 -0800 Subject: [PATCH 2/6] Fix issues in new LusciousRipper class --- .../com/rarchives/ripme/ripper/rippers/LusciousRipper.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/LusciousRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/LusciousRipper.java index 5637ed1b6..9a57b06f2 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/LusciousRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/LusciousRipper.java @@ -10,6 +10,8 @@ import java.io.IOException; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; import java.net.URLEncoder; import java.util.ArrayList; @@ -27,10 +29,10 @@ public LusciousRipper(URL url) throws IOException { } @Override - public URL sanitizeURL(URL url) throws MalformedURLException { + public URL sanitizeURL(URL url) throws MalformedURLException, URISyntaxException{ String URLToReturn = url.toExternalForm(); URLToReturn = URLToReturn.replaceAll("https?://(?:www\\.)?luscious\\.", "https://old.luscious."); - URL san_url = new URL(URLToReturn); + URL san_url = new URI(URLToReturn).toURL(); LOGGER.info("sanitized URL is " + san_url.toExternalForm()); return san_url; } From df975433494231d80971df44bb19adad5c5a5554 Mon Sep 17 00:00:00 2001 From: MetaPrime Date: Thu, 2 Jan 2025 01:28:57 -0800 Subject: [PATCH 3/6] README: Add note about --info so users can get the most out of gradle's test runs --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 6334528cd..50885913c 100644 --- a/README.md +++ b/README.md @@ -121,6 +121,7 @@ the following combinations of tags: - testSlow runs tests with tag "slow". - tests can be run by test class, or single test. Use "testAll" so it does not matter if a test is tagged or not. 
+- tests can give the full stack of an assertion, exception, or error if you pass `--info` to the command ```bash ./gradlew test @@ -129,6 +130,7 @@ the following combinations of tags: ./gradlew testSlow ./gradlew testAll --tests XhamsterRipperTest ./gradlew testAll --tests XhamsterRipperTest.testXhamster2Album +./gradlew testAll --tests ChanRipperTest --info ``` Please note that some tests may fail as sites change and our rippers From 29d46491f9dbda8e8d9cab03662b3a904627f5a1 Mon Sep 17 00:00:00 2001 From: MetaPrime Date: Thu, 2 Jan 2025 01:30:13 -0800 Subject: [PATCH 4/6] Fix an issue with the XvideosRipper found by URISyntaxException after refactor --- .../java/com/rarchives/ripme/ripper/rippers/XvideosRipper.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/XvideosRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/XvideosRipper.java index ea19d484b..6f591d18b 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/XvideosRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/XvideosRipper.java @@ -82,7 +82,7 @@ public List getURLsFromPage(Document doc) { String[] lines = e.html().split("\n"); for (String line : lines) { if (line.contains("html5player.setVideoUrlHigh")) { - String videoURL = line.replaceAll("\t", "").replaceAll("html5player.setVideoUrlHigh\\(", "").replaceAll("\'", "").replaceAll("\\);", ""); + String videoURL = line.strip().replaceAll("\t", "").replaceAll("html5player.setVideoUrlHigh\\(", "").replaceAll("\'", "").replaceAll("\\);", ""); results.add(videoURL); } } From b1e3771cc9f26b728b615f5ec6fd9e3300029080 Mon Sep 17 00:00:00 2001 From: MetaPrime Date: Thu, 2 Jan 2025 02:13:20 -0800 Subject: [PATCH 5/6] Change to originalURL parsing resulted in a different exception if it's malformed, so handle those and refuse to rip --- .../java/com/rarchives/ripme/ripper/rippers/ChanRipper.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/ChanRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/ChanRipper.java index 7551d198e..c985f1612 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/ChanRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/ChanRipper.java @@ -256,7 +256,7 @@ public List getURLsFromPage(Document page) throws URISyntaxException { URL originalURL; try { originalURL = new URI(href).toURL(); - } catch (MalformedURLException e) { + } catch (MalformedURLException | URISyntaxException | IllegalArgumentException e) { continue; } From 692430cfcb2a5503c781d84acf2d532c079596eb Mon Sep 17 00:00:00 2001 From: MetaPrime Date: Thu, 2 Jan 2025 02:29:08 -0800 Subject: [PATCH 6/6] Convert space to %20 before adding URL for later conversion --- .../java/com/rarchives/ripme/ripper/rippers/NudeGalsRipper.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/NudeGalsRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/NudeGalsRipper.java index ea145aad3..ae9faaedc 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/NudeGalsRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/NudeGalsRipper.java @@ -56,6 +56,7 @@ public List getURLsFromPage(Document doc) { for (Element thumb : thumbs) { String link = thumb.attr("src").replaceAll("thumbs/th_", ""); String imgSrc = "http://nude-gals.com/" + link; + imgSrc = imgSrc.replaceAll(" ", "%20"); imageURLs.add(imgSrc); }
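
The recurring change across patches 1, 2, 4, 5, and 6 is the migration away from the deprecated `new URL(String)` constructor to `new URI(String).toURL()`. The sketch below is not part of the patch series; it is a minimal, self-contained illustration of that pattern, assuming a hypothetical helper class (`UrlMigrationSketch`, `legacyParse`, `parse` are illustrative names, not code from RipMe). It also shows why the follow-up patches were needed: `URI` parsing is stricter than the old `URL(String)` constructor, so callers have to handle `URISyntaxException`/`IllegalArgumentException` (as in the ChanRipper change) and percent-encode characters such as spaces up front (as in the NudeGalsRipper change).

```java
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;

public class UrlMigrationSketch {

    // Old style: new URL(String) is deprecated (since Java 20) and does almost
    // no validation of the spec string, so malformed input surfaces late.
    @SuppressWarnings("deprecation")
    static URL legacyParse(String spec) throws MalformedURLException {
        return new URL(spec);
    }

    // Replacement pattern: parse with URI first, then convert. new URI(String)
    // throws URISyntaxException for input the old constructor tolerated (for
    // example unencoded spaces), and URI.toURL() can additionally throw
    // MalformedURLException or IllegalArgumentException, so callers that used
    // to catch only MalformedURLException must widen their catch clauses.
    static URL parse(String spec) throws MalformedURLException, URISyntaxException {
        // Hypothetical pre-encoding step mirroring the space-to-%20 fix:
        // URI rejects raw spaces that URL(String) silently accepted.
        String encoded = spec.replace(" ", "%20");
        return new URI(encoded).toURL();
    }

    public static void main(String[] args) throws Exception {
        // A raw space would make new URI(...) throw without the encoding step.
        System.out.println(parse("http://example.com/some image.jpg"));
    }
}
```

Under these assumptions, the widened `throws` clauses and the broadened `catch (MalformedURLException | URISyntaxException | IllegalArgumentException e)` in the patches follow directly from the stricter parsing, rather than being independent behavior changes.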