Stealth web scraping in Python: Avoid blocking like a ninja | by Ander Rodriguez | Jul, 2021


Ander Rodriguez
# Install the HTTP client, the Playwright Python bindings and the HTML parser
# (bs4) that the scraping snippets import.
pip install requests playwright beautifulsoup4
# Download the browser binaries with the CLI that ships with the Python
# package; `npx playwright` belongs to the Node.js distribution instead.
playwright install
import requests

# Ask httpbin which public IP address the request came from.
# requests has no default timeout, so bound the wait explicitly to avoid
# hanging forever on an unresponsive host.
response = requests.get('http://httpbin.org/ip', timeout=10)
print(response.json()['origin'])
# xyz.84.7.8

import requests

# Route the request through a proxy so the target reports the proxy's address.
# Only the 'http' scheme is mapped here; HTTPS URLs would need an 'https' entry.
proxies = {'http': 'http://190.64.18.177:80'}
# A proxy may be slow or unreachable, so bound the wait with a timeout.
response = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=10)
print(response.json()['origin']) # 190.64.18.162

import requests

# Show the User-Agent header that requests sends when none is supplied.
# The timeout keeps the call from blocking indefinitely.
response = requests.get('http://httpbin.org/headers', timeout=10)
print(response.json()['headers']['User-Agent'])
# python-requests/2.25.1

# Request the httpbin headers endpoint with curl; the sample response below
# shows the User-Agent that curl reported.
curl http://httpbin.org/headers
# { ... "User-Agent": "curl/7.74.0" ... }
import requests 

# Send a browser-like User-Agent instead of the library default.
headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36"}
# The timeout keeps the call from blocking indefinitely.
response = requests.get('http://httpbin.org/headers', headers=headers, timeout=10)
print(response.json()['headers']['User-Agent']) # Mozilla/5.0 ...

import requests 
import random

# Pool of browser User-Agent strings; one is picked at random per run so
# consecutive runs do not always present the same client.
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148',
    'Mozilla/5.0 (Linux; Android 11; SM-G960U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.72 Mobile Safari/537.36',
]
user_agent = random.choice(user_agents)
headers = {'User-Agent': user_agent}
# The timeout keeps the call from blocking indefinitely.
response = requests.get('https://httpbin.org/headers', headers=headers, timeout=10)
print(response.json()['headers']['User-Agent'])
# Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) ...

{
"headers": {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "en-US,en;q=0.9",
"Host": "httpbin.org",
"Sec-Ch-Ua": "\"Chromium\";v=\"92\", \" Not A;Brand\";v=\"99\", \"Google Chrome\";v=\"92\"",
"Sec-Ch-Ua-Mobile": "?0",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "none",
"Sec-Fetch-User": "?1",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36",
"X-Amzn-Trace-Id": "Root=1-60ff12bb-55defac340ac48081d670f9d"
}
}
{
"headers": {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "en-US,en;q=0.5",
"Host": "httpbin.org",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "none",
"Sec-Fetch-User": "?1",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0",
"X-Amzn-Trace-Id": "Root=1-60ff12e8-229efca73430280304023fb9"
}
}
import requests 
import random

# Complete header sets, each mirroring what one browser sends, so that the
# User-Agent is accompanied by the other headers that browser would include.
headers_list = [
    {
        # NOTE(review): 'authority' looks like the HTTP/2 pseudo-header copied
        # from browser dev tools; confirm it should be sent as a regular header.
        'authority': 'httpbin.org',
        'cache-control': 'max-age=0',
        'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
        'sec-ch-ua-mobile': '?0',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'none',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '?1',
        'sec-fetch-dest': 'document',
        'accept-language': 'en-US,en;q=0.9',
    },  # , {...}  <- add further header sets here
]
headers = random.choice(headers_list)
# The timeout keeps the call from blocking indefinitely.
response = requests.get('https://httpbin.org/headers', headers=headers, timeout=10)
print(response.json()['headers'])

import json 
from playwright.sync_api import sync_playwright

# Launch each browser engine in turn and print the User-Agent it sends,
# as echoed back by httpbin.
with sync_playwright() as p:
    # p.webkit is also supported, but there is a problem on Linux
    for browser_type in [p.chromium, p.firefox]:
        browser = browser_type.launch()
        try:
            page = browser.new_page()
            page.goto('https://httpbin.org/headers')
            # The response body is read from the page's <pre> element.
            json_content = json.loads(page.inner_text('pre'))
            print(json_content['headers']['User-Agent'])
        finally:
            # Always release the browser process, even if navigation or
            # parsing fails.
            browser.close()

# Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/93.0.4576.0 Safari/537.36
# Mozilla/5.0 (X11; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0

# Fragment: create a page with extra HTTP headers, here a User-Agent
# ('...' is a placeholder value). Relies on an already-launched `browser`.
browser.new_page(extra_http_headers={'User-Agent': '...'})
import requests 
from bs4 import BeautifulSoup

# Download the shop's first page and list its pagination links.
# The timeout keeps the call from blocking indefinitely.
response = requests.get("https://scrapeme.live/shop/", timeout=10)
# Fail loudly on 4xx/5xx instead of parsing an error page that has no links.
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
# Numbered page links only; the "next" arrow is excluded by the selector.
pages = soup.select(".woocommerce-pagination a.page-numbers:not(.next)")
print(pages[0].get('href')) # https://scrapeme.live/shop/page/2/
print(pages[-1].get('href')) # https://scrapeme.live/shop/page/48/

import sys 
import requests

# Follow a redirect chain hop by hop, printing each status and target, and
# stop as soon as a hop points at the login page.
# Session() is the supported constructor (requests.session() is a legacy
# alias); the context manager closes pooled connections on exit.
with requests.Session() as session:
    # Do not auto-follow, so that every hop can be inspected individually.
    response = session.get('http://instagram.com', allow_redirects=False, timeout=10)
    print(response.status_code, response.headers.get('location'))
    for redirect in session.resolve_redirects(response, response.request, timeout=10):
        location = redirect.headers.get('location')
        print(redirect.status_code, location)
        if location and "accounts/login" in location:
            sys.exit()  # a plain `break` would be enough in this script
# 301 https://instagram.com/
# 301 https://www.instagram.com/
# 302 https://www.instagram.com/accounts/login/





Source link

Latest articles

Related articles

Leave a reply

Please enter your comment!
Please enter your name here