A problem we often encounter in scraping is IP blocking: the target site blocks our IP after we send too many requests. There are several ways to bypass it:
- Using VPN
- Using Proxy
With these two approaches, we can avoid having our IP blocked by the destination website when we send a lot of requests.
Using VPN
In this case we must have a VPN. There are several ways to get one: we can buy a subscription, or use one of the free services. We then activate the VPN on our host computer and run the scraper through it; since the tunnel works at the operating-system level, the scraping code itself does not change.
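As a quick sanity check before scraping, we can confirm the tunnel is active. This is a minimal sketch; httpbin.org/ip (the same test service used later in this post) simply echoes back the public IP the server sees:

import requests

# The VPN rewrites our public IP at the OS level, so no proxy settings
# are needed in the code. httpbin.org/ip echoes back the caller's IP.
response = requests.get('http://httpbin.org/ip')
print(response.json())  # should show the VPN's exit IP, not our real one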
Using Proxy
If our proxy requires authentication, we can attach a Proxy-Authorization header to every request we make with requests:
import requests
from base64 import b64encode

proxies = {
    'http': 'http://173.208.208.74:60099'
}

class HTTPProxyAuth(requests.auth.HTTPBasicAuth):
    """Like requests.auth.HTTPBasicAuth, but adds a Proxy-Authorization header."""
    def __call__(self, r):
        # Basic auth is just "username:password" base64-encoded.
        auth_s = b64encode(f'{self.username}:{self.password}'.encode()).decode()
        r.headers['Proxy-Authorization'] = f'Basic {auth_s}'
        return r

auth = HTTPProxyAuth('user', 'password')
r = requests.get('http://httpbin.org/', proxies=proxies, auth=auth)
print(r.text)
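For basic proxy authentication this custom class is not strictly required: requests already ships requests.auth.HTTPProxyAuth, and it also accepts credentials embedded directly in the proxy URL. A minimal sketch, reusing the same placeholder proxy and credentials as above:

import requests

# Credentials can be embedded straight into the proxy URL; requests will
# build the Proxy-Authorization header for us.
proxies = {
    'http': 'http://user:password@173.208.208.74:60099',
    'https': 'http://user:password@173.208.208.74:60099',
}
r = requests.get('http://httpbin.org/ip', proxies=proxies)
print(r.json())  # should show the proxy's IP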
Using Proxy with Selenium
Here is how to do the same thing if we are using Selenium with Chrome. Chrome does not accept proxy credentials on the command line, so we generate a small extension on the fly that configures the proxy and answers the authentication challenge:
import os
import random
import zipfile

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


def get_driver(use_proxy=False, user_agent: list = None, ip_proxy: str = None, port: int = None,
               username: str = None, password: str = None, debug=True):
    """Selenium setup; optionally routes Chrome through an authenticated proxy.

    user_agent is expected to be a non-empty list of user-agent strings.
    """
    options = Options()
    if use_proxy:
        print(f'Using Proxy: {ip_proxy}:{port}')
        os.makedirs('temp/plugins', exist_ok=True)
        # Chrome cannot take proxy credentials as a flag, so we package a
        # tiny extension that sets the proxy and supplies the credentials.
        manifest_json = """
        {
            "version": "1.0.0",
            "manifest_version": 2,
            "name": "Chrome Proxy",
            "permissions": [
                "proxy",
                "tabs",
                "unlimitedStorage",
                "storage",
                "<all_urls>",
                "webRequest",
                "webRequestBlocking"
            ],
            "background": {
                "scripts": ["background.js"]
            },
            "minimum_chrome_version": "22.0.0"
        }
        """
        background_js = """
        var config = {
            mode: "fixed_servers",
            rules: {
                singleProxy: {
                    scheme: "http",
                    host: "%s",
                    port: parseInt(%s)
                },
                bypassList: ["localhost"]
            }
        };
        chrome.proxy.settings.set({value: config, scope: "regular"}, function() {});
        function callbackFn(details) {
            return {
                authCredentials: {
                    username: "%s",
                    password: "%s"
                }
            };
        }
        chrome.webRequest.onAuthRequired.addListener(
            callbackFn,
            {urls: ["<all_urls>"]},
            ['blocking']
        );
        """ % (ip_proxy, port, username, password)
        plugin_file = 'temp/plugins/proxy_auth.zip'
        with zipfile.ZipFile(plugin_file, 'w') as zp:
            zp.writestr("manifest.json", manifest_json)
            zp.writestr("background.js", background_js)
        options.add_extension(plugin_file)
    else:
        # Extensions are disabled in incognito by default, so we only use
        # it when no proxy extension is needed.
        options.add_argument('--incognito')
    options.add_argument(f'user-agent={random.choice(user_agent)}')
    if not debug:
        # Note: classic --headless does not load extensions; on recent Chrome
        # versions use --headless=new if the proxy extension must run headless.
        options.add_argument('--headless')
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    return driver
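A hypothetical call might look like this (the user-agent list and the proxy credentials below are placeholders, not real values):

# Placeholder values for illustration only.
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
]
driver = get_driver(use_proxy=True, user_agent=user_agents,
                    ip_proxy='173.208.208.74', port=60099,
                    username='user', password='password')
driver.get('http://httpbin.org/ip')  # the page should show the proxy's IP
print(driver.page_source)
driver.quit()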
Conclusion
With these approaches we can scrape without having to worry about our IP being blocked. Okay, that's it!