Skip to content

Commit 8104ce3

Browse files
authored
Merge pull request #537 from aliparlakci/development
2 parents 2d6e25d + f716d98 commit 8104ce3

9 files changed

Lines changed: 71 additions & 63 deletions

File tree

bdfr/resource.py

Lines changed: 28 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -30,33 +30,7 @@ def __init__(self, source_submission: Submission, url: str, download_function: C
3030

3131
@staticmethod
3232
def retry_download(url: str) -> Callable:
33-
max_wait_time = 300
34-
35-
def http_download(download_parameters: dict) -> Optional[bytes]:
36-
current_wait_time = 60
37-
if 'max_wait_time' in download_parameters:
38-
max_wait_time = download_parameters['max_wait_time']
39-
else:
40-
max_wait_time = 300
41-
while True:
42-
try:
43-
response = requests.get(url)
44-
if re.match(r'^2\d{2}', str(response.status_code)) and response.content:
45-
return response.content
46-
elif response.status_code in (408, 429):
47-
raise requests.exceptions.ConnectionError(f'Response code {response.status_code}')
48-
else:
49-
raise BulkDownloaderException(
50-
f'Unrecoverable error requesting resource: HTTP Code {response.status_code}')
51-
except (requests.exceptions.ConnectionError, requests.exceptions.ChunkedEncodingError) as e:
52-
logger.warning(f'Error occured downloading from {url}, waiting {current_wait_time} seconds: {e}')
53-
time.sleep(current_wait_time)
54-
if current_wait_time < max_wait_time:
55-
current_wait_time += 60
56-
else:
57-
logger.error(f'Max wait time exceeded for resource at url {url}')
58-
raise
59-
return http_download
33+
return lambda global_params: Resource.http_download(url, global_params)
6034

6135
def download(self, download_parameters: Optional[dict] = None):
6236
if download_parameters is None:
@@ -82,3 +56,30 @@ def _determine_extension(self) -> Optional[str]:
8256
match = re.search(extension_pattern, stripped_url)
8357
if match:
8458
return match.group(1)
59+
60+
@staticmethod
61+
def http_download(url: str, download_parameters: dict) -> Optional[bytes]:
62+
headers = download_parameters.get('headers')
63+
current_wait_time = 60
64+
if 'max_wait_time' in download_parameters:
65+
max_wait_time = download_parameters['max_wait_time']
66+
else:
67+
max_wait_time = 300
68+
while True:
69+
try:
70+
response = requests.get(url, headers=headers)
71+
if re.match(r'^2\d{2}', str(response.status_code)) and response.content:
72+
return response.content
73+
elif response.status_code in (408, 429):
74+
raise requests.exceptions.ConnectionError(f'Response code {response.status_code}')
75+
else:
76+
raise BulkDownloaderException(
77+
f'Unrecoverable error requesting resource: HTTP Code {response.status_code}')
78+
except (requests.exceptions.ConnectionError, requests.exceptions.ChunkedEncodingError) as e:
79+
logger.warning(f'Error occured downloading from {url}, waiting {current_wait_time} seconds: {e}')
80+
time.sleep(current_wait_time)
81+
if current_wait_time < max_wait_time:
82+
current_wait_time += 60
83+
else:
84+
logger.error(f'Max wait time exceeded for resource at url {url}')
85+
raise

bdfr/site_downloaders/erome.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import logging
44
import re
5-
from typing import Optional
5+
from typing import Callable, Optional
66

77
import bs4
88
from praw.models import Submission
@@ -29,7 +29,7 @@ def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> l
2929
for link in links:
3030
if not re.match(r'https?://.*', link):
3131
link = 'https://' + link
32-
out.append(Resource(self.post, link, Resource.retry_download(link)))
32+
out.append(Resource(self.post, link, self.erome_download(link)))
3333
return out
3434

3535
@staticmethod
@@ -43,3 +43,14 @@ def _get_links(url: str) -> set[str]:
4343
out.extend([vid.get('src') for vid in videos])
4444

4545
return set(out)
46+
47+
@staticmethod
48+
def erome_download(url: str) -> Callable:
49+
download_parameters = {
50+
'headers': {
51+
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
52+
' Chrome/88.0.4324.104 Safari/537.36',
53+
'Referer': 'https://www.erome.com/',
54+
},
55+
}
56+
return lambda global_params: Resource.http_download(url, global_params | download_parameters)

bdfr/site_downloaders/youtube.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from pathlib import Path
66
from typing import Callable, Optional
77

8-
import youtube_dl
8+
import yt_dlp
99
from praw.models import Submission
1010

1111
from bdfr.exceptions import NotADownloadableLinkError, SiteDownloaderError
@@ -45,9 +45,9 @@ def download(_: dict) -> bytes:
4545
download_path = Path(temp_dir).resolve()
4646
ytdl_options['outtmpl'] = str(download_path) + '/' + 'test.%(ext)s'
4747
try:
48-
with youtube_dl.YoutubeDL(ytdl_options) as ydl:
48+
with yt_dlp.YoutubeDL(ytdl_options) as ydl:
4949
ydl.download([self.post.url])
50-
except youtube_dl.DownloadError as e:
50+
except yt_dlp.DownloadError as e:
5151
raise SiteDownloaderError(f'Youtube download failed: {e}')
5252

5353
downloaded_files = list(download_path.iterdir())
@@ -64,7 +64,7 @@ def download(_: dict) -> bytes:
6464
def get_video_attributes(url: str) -> dict:
6565
yt_logger = logging.getLogger('youtube-dl')
6666
yt_logger.setLevel(logging.CRITICAL)
67-
with youtube_dl.YoutubeDL({'logger': yt_logger, }) as ydl:
67+
with yt_dlp.YoutubeDL({'logger': yt_logger, }) as ydl:
6868
try:
6969
result = ydl.extract_info(url, download=False)
7070
return result

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,4 @@ ffmpeg-python>=0.2.0
66
praw>=7.2.0
77
pyyaml>=5.4.1
88
requests>=2.25.1
9-
youtube-dl>=2021.3.14
9+
yt-dlp>=2021.9.25

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ description_file = README.md
44
description_content_type = text/markdown
55
home_page = https://github.com/aliparlakci/bulk-downloader-for-reddit
66
keywords = reddit, download, archive
7-
version = 2.4.1
7+
version = 2.4.2
88
author = Ali Parlakci
99
author_email = parlakciali@gmail.com
1010
maintainer = Serene Arc

tests/site_downloaders/fallback_downloaders/test_youtubedl_fallback.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ def test_can_handle_link(test_url: str, expected: bool):
2222
@pytest.mark.online
2323
@pytest.mark.slow
2424
@pytest.mark.parametrize(('test_url', 'expected_hash'), (
25-
('https://streamable.com/dt46y', '1e7f4928e55de6e3ca23d85cc9246bbb'),
25+
('https://streamable.com/dt46y', 'b7e465adaade5f2b6d8c2b4b7d0a2878'),
2626
('https://streamable.com/t8sem', '49b2d1220c485455548f1edbc05d4ecf'),
2727
('https://www.reddit.com/r/specializedtools/comments/n2nw5m/bamboo_splitter/', '21968d3d92161ea5e0abdcaf6311b06c'),
2828
('https://v.redd.it/9z1dnk3xr5k61', '351a2b57e888df5ccbc508056511f38d'),
@@ -34,4 +34,6 @@ def test_find_resources(test_url: str, expected_hash: str):
3434
resources = downloader.find_resources()
3535
assert len(resources) == 1
3636
assert isinstance(resources[0], Resource)
37+
for res in resources:
38+
res.download()
3739
assert resources[0].hash.hexdigest() == expected_hash

tests/site_downloaders/test_erome.py

Lines changed: 17 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/usr/bin/env python3
22
# coding=utf-8
3-
3+
import re
44
from unittest.mock import MagicMock
55

66
import pytest
@@ -11,44 +11,37 @@
1111
@pytest.mark.online
1212
@pytest.mark.parametrize(('test_url', 'expected_urls'), (
1313
('https://www.erome.com/a/vqtPuLXh', (
14-
'https://s11.erome.com/365/vqtPuLXh/KH2qBT99_480p.mp4',
14+
r'https://s\d+.erome.com/365/vqtPuLXh/KH2qBT99_480p.mp4',
1515
)),
1616
('https://www.erome.com/a/ORhX0FZz', (
17-
'https://s15.erome.com/355/ORhX0FZz/9IYQocM9_480p.mp4',
18-
'https://s15.erome.com/355/ORhX0FZz/9eEDc8xm_480p.mp4',
19-
'https://s15.erome.com/355/ORhX0FZz/EvApC7Rp_480p.mp4',
20-
'https://s15.erome.com/355/ORhX0FZz/LruobtMs_480p.mp4',
21-
'https://s15.erome.com/355/ORhX0FZz/TJNmSUU5_480p.mp4',
22-
'https://s15.erome.com/355/ORhX0FZz/X11Skh6Z_480p.mp4',
23-
'https://s15.erome.com/355/ORhX0FZz/bjlTkpn7_480p.mp4'
17+
r'https://s\d+.erome.com/355/ORhX0FZz/9IYQocM9_480p.mp4',
18+
r'https://s\d+.erome.com/355/ORhX0FZz/9eEDc8xm_480p.mp4',
19+
r'https://s\d+.erome.com/355/ORhX0FZz/EvApC7Rp_480p.mp4',
20+
r'https://s\d+.erome.com/355/ORhX0FZz/LruobtMs_480p.mp4',
21+
r'https://s\d+.erome.com/355/ORhX0FZz/TJNmSUU5_480p.mp4',
22+
r'https://s\d+.erome.com/355/ORhX0FZz/X11Skh6Z_480p.mp4',
23+
r'https://s\d+.erome.com/355/ORhX0FZz/bjlTkpn7_480p.mp4'
2424
)),
2525
))
2626
def test_get_link(test_url: str, expected_urls: tuple[str]):
2727
result = Erome. _get_links(test_url)
28-
assert set(result) == set(expected_urls)
28+
assert all([any([re.match(p, r) for r in result]) for p in expected_urls])
2929

3030

3131
@pytest.mark.online
3232
@pytest.mark.slow
33-
@pytest.mark.parametrize(('test_url', 'expected_hashes'), (
34-
('https://www.erome.com/a/vqtPuLXh', {
35-
'5da2a8d60d87bed279431fdec8e7d72f'
36-
}),
37-
('https://www.erome.com/a/lGrcFxmb', {
38-
'0e98f9f527a911dcedde4f846bb5b69f',
39-
'25696ae364750a5303fc7d7dc78b35c1',
40-
'63775689f438bd393cde7db6d46187de',
41-
'a1abf398cfd4ef9cfaf093ceb10c746a',
42-
'bd9e1a4ea5ef0d6ba47fb90e337c2d14'
43-
}),
33+
@pytest.mark.parametrize(('test_url', 'expected_hashes_len'), (
34+
('https://www.erome.com/a/vqtPuLXh', 1),
35+
('https://www.erome.com/a/4tP3KI6F', 1),
4436
))
45-
def test_download_resource(test_url: str, expected_hashes: tuple[str]):
37+
def test_download_resource(test_url: str, expected_hashes_len: int):
4638
# Can't compare hashes for this test, Erome doesn't return the exact same file from request to request so the hash
4739
# will change back and forth randomly
4840
mock_submission = MagicMock()
4941
mock_submission.url = test_url
5042
test_site = Erome(mock_submission)
5143
resources = test_site.find_resources()
52-
[res.download() for res in resources]
44+
for res in resources:
45+
res.download()
5346
resource_hashes = [res.hash.hexdigest() for res in resources]
54-
assert len(resource_hashes) == len(expected_hashes)
47+
assert len(resource_hashes) == expected_hashes_len

tests/site_downloaders/test_pornhub.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
@pytest.mark.online
1313
@pytest.mark.slow
1414
@pytest.mark.parametrize(('test_url', 'expected_hash'), (
15-
('https://www.pornhub.com/view_video.php?viewkey=ph5a2ee0461a8d0', '5f5294b9b97dbb7cb9cf8df278515621'),
15+
('https://www.pornhub.com/view_video.php?viewkey=ph6074c59798497', 'd9b99e4ebecf2d8d67efe5e70d2acf8a'),
1616
))
1717
def test_find_resources_good(test_url: str, expected_hash: str):
1818
test_submission = MagicMock()

tests/site_downloaders/test_youtube.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,9 @@
1313
@pytest.mark.online
1414
@pytest.mark.slow
1515
@pytest.mark.parametrize(('test_url', 'expected_hash'), (
16-
('https://www.youtube.com/watch?v=uSm2VDgRIUs', 'f70b704b4b78b9bb5cd032bfc26e4971'),
17-
('https://www.youtube.com/watch?v=GcI7nxQj7HA', '2bfdbf434ed284623e46f3bf52c36166'),
16+
('https://www.youtube.com/watch?v=uSm2VDgRIUs', '2d60b54582df5b95ec72bb00b580d2ff'),
17+
('https://www.youtube.com/watch?v=GcI7nxQj7HA', '5db0fc92a0a7fb9ac91e63505eea9cf0'),
18+
('https://youtu.be/TMqPOlp4tNo', 'f68c00b018162857f3df4844c45302e7'), # Age restricted
1819
))
1920
def test_find_resources_good(test_url: str, expected_hash: str):
2021
test_submission = MagicMock()

0 commit comments

Comments
 (0)