python – Trouble selecting functional proxies from a list of proxies quickly

UPDATE CODE 07-05-2022

I reworked my code posted on 07-01-2022 to output these data elements: business name, business categories, and business website.

1.Casa Bianca Pizza Pie
Pizza, Italian Restaurants, Restaurants
Home
2.Palermo Italian Restaurant
Pizza, Restaurants, Italian Restaurants
no website
3.Crispy Crust Hollywood Location
Pizza, Restaurants, Food Delivery Service
http://vine.crispycrust.com/zgrid/proc/site/sitep.jsp
4.Crispy Crust Los Angeles/Glendale Location
Pizza, Restaurant Delivery Service, Caterers
http://crispycrust.com/
5.Papa Johns Pizza
Pizza, Restaurants, Take Out Restaurants
https://locations.papajohns.com/united-states/ca/90034/los-angeles/9844-national-blvd?utm_source=yext-listings&utm_medium=referral&y_source=1_MTA2OTEzMTAtNTUzLWxvY2F0aW9uLndlYnNpdGU%3D

UPDATE CODE 07-01-2022

I noticed that errors were being thrown when using the free proxies, so I added the requests_retry_session function to handle this. I didn’t rework all your code, but I did make sure that I could query the site and produce results using a free proxy. You should be able to work my code into yours.

import random
import logging
import requests
from time import sleep
from random import randint
from bs4 import BeautifulSoup
from proxy_checking import ProxyChecker
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from http_request_randomizer.requests.proxy.ProxyObject import Protocol
from http_request_randomizer.requests.proxy.requestProxy import RequestProxy

current_proxy = ''

def requests_retry_session(retries=5,
                            backoff_factor=0.5,
                            status_force_list=(500, 502, 504),
                            session=None,
                            ):
    session = session or requests.Session()

    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_force_list,
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session


def random_ssl_proxy_address():
    try:
        # Obtain a list of HTTPS proxies
        # Suppress the console debugging output by setting the log level
        req_proxy = RequestProxy(log_level=logging.ERROR, protocol=Protocol.HTTPS)

        # Obtain a random single proxy from the list of proxy addresses
        random_proxy = random.sample(req_proxy.get_proxy_list(), 1)

        return random_proxy[0].get_address()
    except AttributeError as e:
        pass


def proxy_verification(current_proxy_address):
    checker = ProxyChecker()
    # check_proxy queries a proxy judge and returns a dict; the proxy is
    # treated as valid only if that dict contains 'status': True
    proxy_judge = checker.check_proxy(current_proxy_address)
    proxy_status = bool([value for key, value in proxy_judge.items() if key == 'status' and value is True])
    return proxy_status


def get_proxy_address():
    global current_proxy
    random_proxy_address = random_ssl_proxy_address()
    current_proxy = random_proxy_address
    proxy_status = proxy_verification(random_proxy_address)
    if proxy_status is True:
        return
    else:
        print('Looking for a valid proxy address.')

        # this sleep timer is helping with some timeout issues
        # that were happening when querying
        sleep(randint(5, 10))

        get_proxy_address()


def fetch_resp(link, http_headers, proxy_url):

    response = requests_retry_session().get(link,
                                            headers=http_headers,
                                            allow_redirects=True,
                                            verify=True,
                                            proxies=proxy_url,
                                            timeout=(30, 45))
    print("status code", response.status_code)
    return response


def get_content(link, headers, proxy_urls):
    res = fetch_resp(link, headers, proxy_urls)
    soup = BeautifulSoup(res.text, "lxml")
    info_sections = soup.find_all('li', {'class': 'business-card'})
    for info_section in info_sections:
        shop_name = info_section.find('h2', {'class': 'title business-name'})
        print(shop_name.text)
        categories=", ".join([i.text for i in info_section.find_all('a', {'class': 'category'})])
        print(categories)
        business_website = info_section.find('a', {'class': 'website listing-cta action'})
        if business_website is not None:
            print(business_website['href'])
        else:
            print('no website')

get_proxy_address()
if len(current_proxy) != 0:
    print(current_proxy)

    base_url="https://www.yellowpages.com{}"
    current_url="https://www.yellowpages.com/search?search_terms=pizza&geo_location_terms=Los+Angeles%2C+CA"

    headers = {
        'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_3_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.5 Mobile/15E148 Safari/604.1',
    }

    PROXIES = {
        'https': f"http://{current_proxy}"
    }

    get_content(current_url, headers, PROXIES)
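
If the chosen proxy dies partway through a scrape, you can simply pick a new one and retry. The wrapper below is a minimal sketch of that idea and is not part of the code above; it reuses get_proxy_address(), get_content() and the current_proxy global, and rotates to a fresh proxy whenever requests raises an exception.

def get_content_with_rotation(link, http_headers, max_attempts=3):
    # Minimal sketch: retry the page with a fresh proxy when the current one
    # fails. get_proxy_address(), get_content() and current_proxy come from
    # the code above; max_attempts is an arbitrary cap.
    for attempt in range(max_attempts):
        proxy_urls = {'https': f"http://{current_proxy}"}
        try:
            get_content(link, http_headers, proxy_urls)
            return
        except requests.exceptions.RequestException as error:
            print(f'Proxy {current_proxy} failed ({error}), selecting a new one.')
            get_proxy_address()
    print(f'No working proxy found after {max_attempts} attempts.')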


PREVIOUS ANSWERS

06-30-2022:

During some testing I found a bug, so I updated my code to handle it.

06-28-2022:

You could use a proxy judge, which is used for testing the performance and the anonymity status of a proxy server.

The code below is from one of my previous answers.

import random
import logging
from time import sleep
from random import randint
from proxy_checking import ProxyChecker
from http_request_randomizer.requests.proxy.ProxyObject import Protocol
from http_request_randomizer.requests.proxy.requestProxy import RequestProxy


current_proxy = ''


def random_ssl_proxy_address():
    try:
        # Obtain a list of HTTPS proxies
        # Suppress the console debugging output by setting the log level
        req_proxy = RequestProxy(log_level=logging.ERROR, protocol=Protocol.HTTPS)

        # Obtain a random single proxy from the list of proxy addresses
        random_proxy = random.sample(req_proxy.get_proxy_list(), 1)

        return random_proxy[0].get_address()
    except AttributeError as e:
        pass


def proxy_verification(current_proxy_address):
    checker = ProxyChecker()
    proxy_judge = checker.check_proxy(current_proxy_address)
    proxy_status = bool([value for key, value in proxy_judge.items() if key == 'status' and value is True])
    return proxy_status


def get_proxy_address():
    global current_proxy
    random_proxy_address = random_ssl_proxy_address()
    current_proxy = random_proxy_address
    proxy_status = proxy_verification(random_proxy_address)
    if proxy_status is True:
        return
    else:
        print('Looking for a valid proxy address.')

        # this sleep timer is helping with some timeout issues
        # that were happening when querying
        sleep(randint(5, 10))

        get_proxy_address()


get_proxy_address()
if len(current_proxy) != 0:
    print(f'Valid proxy address: {current_proxy}')
    # output
    # Valid proxy address: 157.100.12.138:999

I noted today that the Python package HTTP_Request_Randomizer has a couple of Beautiful Soup selectors that need to be modified, because they currently don’t work in version 1.3.2 of HTTP_Request_Randomizer.

You need to modify line 27 in FreeProxyParser.py to this:

table = soup.find("table", attrs={"class": "table table-striped table-bordered"})

You need to modify line 27 in SslProxyParser.py to this:

table = soup.find("table", attrs={"class": "table table-striped table-bordered"})
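
If you are not sure where those two files live in your environment, printing the module paths is a quick way to find them. This is just a convenience sketch and assumes the parsers sit in the http_request_randomizer.requests.parsers package:

import http_request_randomizer.requests.parsers.FreeProxyParser as free_proxy_parser
import http_request_randomizer.requests.parsers.SslProxyParser as ssl_proxy_parser

# Prints the full paths of the installed parser files that need the one-line edit
print(free_proxy_parser.__file__)
print(ssl_proxy_parser.__file__)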

I found another bug that needs to be fixed. This one is in proxy_checking.py: I had to add the line if url != None: to the get_info method, as shown below.

    def get_info(self, url=None, proxy=None):
        info = {}
        proxy_type = []
        judges = ['http://proxyjudge.us/azenv.php', 'http://azenv.net/', 'http://httpheader.net/azenv.php', 'http://mojeip.net.pl/asdfa/azenv.php']
        if url != None:
            try:
                response = requests.get(url, headers=headers, timeout=5)
                return response
            except:
                pass
        elif proxy != None:
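
After patching proxy_checking.py you can run a quick sanity check. This is only a sketch; the proxy address below is the example address from the earlier output and is unlikely to still be live, so substitute one from your own list.

from proxy_checking import ProxyChecker

# Quick sanity check after the patch; replace the address below with a live
# proxy from your own list.
checker = ProxyChecker()
result = checker.check_proxy('157.100.12.138:999')
print(result.get('status'))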
