For those of you who have upvoted (or saved) quite a bunch of posts with Imgur-hosted images over on reddit.com and know your way around Python, there's a script that allows you to download images that are embedded within posts. Among others, it will download images from Imgur (it also downloads, e.g., images directly embedded into posts).
on line 160, you can switch between saved and upvoted posts.
I'll also put the code below. I don't expect pastebin to shut down anytime soon, but better safe than sorry.
Code:
import requests
from datetime import datetime
import os
import pandas as pd
import praw.models
import urllib.request
import logging, sys
from bs4 import *
# How to use:
# 1. Install python3 - https://www.python.org/downloads/
# 2. Open "Command prompt" on your PC and copy and paste: `pip install pandas bs4 praw requests` (Without quotes; urllib is part of the standard library)
# 3. Fill in details below (Link explains how)
# 4. Run this file, and it will download your last 1000 upvoted posts (1000 is the max set by reddit)
# Fails to work for: Redgifs, bdsmlr
# Downloads crap along with correct post for: gfycat (Also fails half of the time)
# Fill in your details here
# https://praw.readthedocs.io/en/stable/getting_started/authentication.html#password-flow
# PRAW client using the password flow; fill in the credentials here
# (the link above explains how to obtain client_id / client_secret).
# NOTE: constructing this at import time means the script needs valid
# credentials before anything else runs.
reddit = praw.Reddit(
client_id="",
client_secret="",
password="",
user_agent="Downloads images from /u/<username>/upvoted before Imgur deletes them all",
username=""
)
# Column layout of the manifest CSV written at the end of the run.
column_list = ["title", "post_url", "user", "image_url", "image_loc", "notes"]
# Empty manifest; one row per processed submission is concatenated on below.
upvoted_df = pd.DataFrame(data=None,
index=None,
columns=column_list,
dtype=None,
copy=None
)
def clean_title(submission_title: str) -> str:
    """
    Sanitize a submission title for use as a Windows file name.

    Every character Windows forbids in file names (``\\ / : * ? " < > |``)
    is replaced with ``#``, and the result is truncated to 180 characters
    to stay within Windows path-length limits.

    :param submission_title: Raw reddit submission title.
    :return: Sanitized, length-limited title.
    """
    forbidden = '\\/:*?"<>|'
    # One C-level pass over the string instead of chained .replace() calls.
    sanitized = submission_title.translate(str.maketrans(forbidden, "#" * len(forbidden)))
    return sanitized[:180]
def download_images(url, folder_name) -> None:
    """
    Download every <img> and <video> element found on the page at ``url``.

    Adapted from:
    https://www.geeksforgeeks.org/how-to-download-all-images-from-a-web-page-in-python/
    # TODO doesn't work with redgifs

    :param url: URL of the page to scrape for media.
    :param folder_name: Relative folder destination for downloaded files.
    :return: None
    """
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    for tag_name in ["img", "video"]:
        tags = soup.findAll(tag_name, limit=100)
        count = 0
        print(f"Total {len(tags)} {tag_name} Found!")
        for i, tag in enumerate(tags):
            # The media URL may live in any of these attributes; take the
            # first one present (same precedence as the original code).
            image_link = None
            for attr in ("data-srcset", "data-src", "data-fallback-src", "src"):
                if tag.has_attr(attr):
                    image_link = tag[attr]
                    break
            if image_link is None:
                # BUG FIX: the original left image_link unset (NameError on
                # the first tag) or stale from the previous tag, which could
                # download the wrong file. Skip tags with no source instead.
                continue
            try:
                content = requests.get(image_link).content
                try:
                    # If the body decodes as UTF-8 it is text (e.g. an error
                    # page), not media, and is deliberately skipped.
                    str(content, 'utf-8')
                except UnicodeDecodeError:
                    # Binary content: write it out as an image file.
                    with open(f"{folder_name}/image{i + 1}.jpg", "wb+") as f:
                        f.write(content)
                    count += 1
            except Exception:
                # BUG FIX: the original re-fetched the URL inside the except
                # clause, which could raise again and escape the handler.
                print(f"Could not find content for '{image_link}'")
        if count == len(tags):
            print(f"All {tag_name}s Downloaded!")
        else:
            print(f"Total {count} {tag_name}s downloaded out of {len(tags)}")
# --- Main loop: archive every saved submission and record it in upvoted_df ---
print(f"Downloading upvoted posts for: {reddit.user.me()}")
current_redditor: praw.models.Redditor = reddit.user.me()
cwd = os.path.dirname(__file__)
# Output folders; FileExistsError just means a previous run created them.
try:
    os.mkdir(f"{cwd}/images/")
except FileExistsError:
    print("/images/ already exists")
try:
    os.mkdir(f"{cwd}/posts/")
except FileExistsError:
    print(f"/posts/ already exists")
# Max limit that PRAW allows easily (1000).
# Swap .saved(...) for .upvoted(...) to archive upvoted posts instead.
for counter, submission in enumerate(current_redditor.saved(limit=1000)):
    try:
        submission: praw.models.Submission  # So editor knows
        filetype: str
        to_append: pd.DataFrame
        # Saved items can also be comments, which have no 'title'.
        if not hasattr(submission, 'title'):
            print("Found a comment.")
            print(submission.link_title)
            print(submission.link_permalink)
            print("Skipping.")
            continue
        title = clean_title(submission.title)
        title_with_counter = f"{counter}-{title}"
        author = submission.author
        if author is None:
            author = "[deleted]"
        else:
            author = submission.author.name
        # If a url link
        if submission.selftext == "":
            # If image/video link
            # https://help.imgur.com/hc/en-us/articles/115000083326-What-files-can-I-upload-Is-there-a-size-limit-
            (_, filetype) = os.path.splitext(submission.url)
            if filetype.upper() in [".PNG", ".GIF", ".JPG", ".JPEG", ".MP4", ".MPEG", ".AVI", ".WEBM",
                                    ".APNG", ".TIFF", ".MOV", ".QT", ".MKV",
                                    ".MK3D", ".MKA", ".MKS", ".FLV", ".F4V", ".F4P", ".F4A", ".F4B"]:
                print(f"Directly Downloading: '{submission.url}' as {filetype}")
                # BUG FIX: splitext keeps the leading dot, so the original
                # f-string produced names like "title..jpg" (double dot).
                image_loc = f"{cwd}/images/{title_with_counter}{filetype}"
                # Save image
                urllib.request.urlretrieve(submission.url, image_loc)
                df_row = pd.DataFrame(
                    [[submission.title, submission.permalink, author,
                      submission.url, image_loc, "IMAGE"]],
                    columns=column_list)
            # Non-Image url: scrape the page and download everything on it.
            else:
                print(f"Downloading files on page for: '{submission.url}'")
                image_folder_loc = f"{cwd}/images/{title_with_counter}/"
                try:
                    os.mkdir(image_folder_loc)
                except FileExistsError:
                    print(f"/images/{title_with_counter} already exists")
                download_images(submission.url, image_folder_loc)
                df_row = pd.DataFrame(
                    [[submission.title, submission.permalink, author,
                      submission.url, image_folder_loc, "IMAGE FOLDER"]],
                    columns=column_list)
        # If non-url (text) post
        # TODO could be Poll I guess
        else:
            print(f"Downloading Text For: '{submission.url}'")
            txt_loc = f"{cwd}/posts/{counter}-{title}.txt"
            # BUG FIX: explicit UTF-8 so non-ASCII selftext doesn't crash
            # with the platform default encoding (cp1252 on Windows).
            with open(txt_loc, "w+", encoding="utf-8") as file:
                file.write(submission.selftext)
            df_row = pd.DataFrame(
                [[submission.title, submission.permalink, author,
                  "", txt_loc, "TEXT POST"]],
                columns=column_list)
        # Append to df
        upvoted_df = pd.concat([upvoted_df, df_row])
    except Exception:
        # BUG FIX: comments have no .title/.permalink, so the original
        # handler could itself raise AttributeError and kill the run.
        failed_title = getattr(submission, 'title', '<unknown>')
        print(f"Failed to download {failed_title}")
        df_row = pd.DataFrame(
            [[failed_title, getattr(submission, 'permalink', ''), "FAILED",
              "", "FAILED", "FAILED"]],
            columns=column_list)
        upvoted_df = pd.concat([upvoted_df, df_row])
# ':' is not allowed in Windows file names, so replace it in the timestamp.
upvoted_df.to_csv(f"{str(datetime.now()).replace(':', '-')}.csv")