A problem we often encounter in scraping is IP blocking: the target site blocks our IP after we send too many requests. There are several ways to bypass it:
- Using VPN
- Using Proxy
With these two approaches, we can avoid having our IP blocked by the destination website when we send a lot of requests.
Using VPN
In this case we must have a VPN. There are several ways to get one: we can buy a subscription, or use one of the free services. We then activate the VPN on our host computer and run the scraper through it; since the tunnel works at the operating-system level, the scraping code itself does not change.
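As a quick sanity check before scraping, we can confirm the tunnel is active. This is a minimal sketch; httpbin.org/ip (the same test service used later in this post) simply echoes back the public IP the server sees:

import requests

# The VPN rewrites our public IP at the OS level, so no proxy settings
# are needed in the code. httpbin.org/ip echoes back the caller's IP.
response = requests.get('http://httpbin.org/ip')
print(response.json())  # should show the VPN's exit IP, not our real one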
Using Proxy
If our proxy requires authentication, we can attach a Proxy-Authorization header to every request we make with requests:
import requests
from base64 import b64encode

proxies = {
    'http': 'http://173.208.208.74:60099'
}

class HTTPProxyAuth(requests.auth.HTTPBasicAuth):
    """Like requests.auth.HTTPBasicAuth, but adds a Proxy-Authorization header."""
    def __call__(self, r):
        # Basic auth is just "username:password" base64-encoded.
        auth_s = b64encode(f'{self.username}:{self.password}'.encode()).decode()
        r.headers['Proxy-Authorization'] = f'Basic {auth_s}'
        return r

auth = HTTPProxyAuth('user', 'password')
r = requests.get('http://httpbin.org/', proxies=proxies, auth=auth)
print(r.text)
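For basic proxy authentication this custom class is not strictly required: requests already ships requests.auth.HTTPProxyAuth, and it also accepts credentials embedded directly in the proxy URL. A minimal sketch, reusing the same placeholder proxy and credentials as above:

import requests

# Credentials can be embedded straight into the proxy URL; requests will
# build the Proxy-Authorization header for us.
proxies = {
    'http': 'http://user:password@173.208.208.74:60099',
    'https': 'http://user:password@173.208.208.74:60099',
}
r = requests.get('http://httpbin.org/ip', proxies=proxies)
print(r.json())  # should show the proxy's IP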
Using Proxy with Selenium
Here is how to do the same thing if we are using Selenium with Chrome. Chrome does not accept proxy credentials on the command line, so we generate a small extension on the fly that configures the proxy and answers the authentication challenge:
import os
import random
import zipfile

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


def get_driver(use_proxy=False, user_agent: list = None, ip_proxy: str = None, port: int = None,
               username: str = None, password: str = None, debug=True):
    """Selenium setup; optionally routes Chrome through an authenticated proxy.

    user_agent is expected to be a non-empty list of user-agent strings.
    """
    options = Options()
    if use_proxy:
        print(f'Using Proxy: {ip_proxy}:{port}')
        os.makedirs('temp/plugins', exist_ok=True)
        # Chrome cannot take proxy credentials as a flag, so we package a
        # tiny extension that sets the proxy and supplies the credentials.
        manifest_json = """
        {
            "version": "1.0.0",
            "manifest_version": 2,
            "name": "Chrome Proxy",
            "permissions": [
                "proxy",
                "tabs",
                "unlimitedStorage",
                "storage",
                "<all_urls>",
                "webRequest",
                "webRequestBlocking"
            ],
            "background": {
                "scripts": ["background.js"]
            },
            "minimum_chrome_version": "22.0.0"
        }
        """
        background_js = """
        var config = {
            mode: "fixed_servers",
            rules: {
                singleProxy: {
                    scheme: "http",
                    host: "%s",
                    port: parseInt(%s)
                },
                bypassList: ["localhost"]
            }
        };
        chrome.proxy.settings.set({value: config, scope: "regular"}, function() {});
        function callbackFn(details) {
            return {
                authCredentials: {
                    username: "%s",
                    password: "%s"
                }
            };
        }
        chrome.webRequest.onAuthRequired.addListener(
            callbackFn,
            {urls: ["<all_urls>"]},
            ['blocking']
        );
        """ % (ip_proxy, port, username, password)
        plugin_file = 'temp/plugins/proxy_auth.zip'
        with zipfile.ZipFile(plugin_file, 'w') as zp:
            zp.writestr("manifest.json", manifest_json)
            zp.writestr("background.js", background_js)
        options.add_extension(plugin_file)
    else:
        # Extensions are disabled in incognito by default, so we only use
        # it when no proxy extension is needed.
        options.add_argument('--incognito')
    options.add_argument(f'user-agent={random.choice(user_agent)}')
    if not debug:
        # Note: classic --headless does not load extensions; on recent Chrome
        # versions use --headless=new if the proxy extension must run headless.
        options.add_argument('--headless')
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    return driver
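A hypothetical call might look like this (the user-agent list and the proxy credentials below are placeholders, not real values):

# Placeholder values for illustration only.
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
]
driver = get_driver(use_proxy=True, user_agent=user_agents,
                    ip_proxy='173.208.208.74', port=60099,
                    username='user', password='password')
driver.get('http://httpbin.org/ip')  # the page should show the proxy's IP
print(driver.page_source)
driver.quit()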
Conclusion
With these approaches we can scrape without having to worry about our IP being blocked. Okay, that's it!