DjangoBB LoFi version

Начало » Python для новичков » Спарсить прокси в тхт

1 2

sasholy

Авг. 21, 2018 17:10:28

Здравствуйте. Кто может отредактировать код, чтобы можно было сохранять прокси в текстовый файл для дальнейшей работы? Нужны https ip. Нужны ли здесь какие-либо проверки типа if, else, ну или, может, как улучшить код? Благодарю.

 import requests
from bs4 import BeautifulSoup
from random import choice
def get_proxy():
    """Pick a random proxy from the first rows of the free-proxy-list.net table.

    Returns:
        dict: ``{'schema': 'http' or 'https', 'address': 'ip:port'}``.

    Raises:
        requests.RequestException: if the listing page cannot be downloaded.
        RuntimeError: if the page holds no proxy table or no usable rows.
    """
    # requests has no default timeout; without one a stalled connection
    # would block the caller forever.
    response = requests.get('https://free-proxy-list.net/', timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    table = soup.find('table', id='proxylisttable')
    if table is None:
        raise RuntimeError('proxy table not found on free-proxy-list.net')
    proxies = []
    # The first row is skipped (presumably the header); at most ten rows
    # after it are considered.
    for tr in table.find_all('tr')[1:11]:
        tds = tr.find_all('td')
        if len(tds) < 7:
            # Not a full data row; indexing it would raise IndexError.
            continue
        ip = tds[0].text.strip()
        port = tds[1].text.strip()
        # Column 6 says whether the proxy supports https ('yes' / otherwise).
        schema = 'https' if 'yes' in tds[6].text.strip() else 'http'
        proxies.append({'schema': schema, 'address': ip + ':' + port})
    if not proxies:
        raise RuntimeError('no proxies found on free-proxy-list.net')
    return choice(proxies)
def get_html(url):
    """Request *url* through a randomly chosen proxy.

    Returns the value stored under ``'origin'`` in the JSON body of the
    response.
    """
    picked = get_proxy()  # shape: {'schema': ..., 'address': ...}
    # requests expects a scheme -> proxy mapping,
    # e.g. {'https': 'ipaddress:5000'}.
    response = requests.get(
        url,
        proxies={picked['schema']: picked['address']},
        timeout=5,
    )
    payload = response.json()
    return payload['origin']
def main():
    """Print the client address reported for a request sent through a proxy."""
    # get_html() decodes the response as JSON and reads its 'origin' key.
    # httpbin's /ip endpoint answers with exactly that document, whereas
    # checkip.dyndns.org serves an HTML page, on which r.json() fails.
    url = 'http://httpbin.org/ip'
    print(get_html(url))
if __name__ == '__main__':
    main()

sosok43k

Авг. 21, 2018 22:27:04

Могу помочь, обращайтесь.

Vigi

Авг. 22, 2018 08:59:44

 import requests
from bs4 import BeautifulSoup as bs
# Download the proxy listing and dump every table cell to out.txt, one cell
# per line, with an empty line after each group of eight cells.
page = requests.get('https://free-proxy-list.net/').text
document = bs(page, 'lxml')
container = document.find('div', {'class': 'table-responsive'})
cells = container.find_all('td')
with open('out.txt', 'w', encoding='utf-8') as out:
    for position, cell in enumerate(cells, start=1):
        print(cell.text, file=out)
        if position % 8 == 0:
            # Eight cells written: separate them from the next group.
            print(file=out)

sasholy

Авг. 22, 2018 10:28:02

Благодарю, не могли бы Вы допилить, чтобы выбирало только https, можно было указать, сколько надо прокси (20, 50, 150), и записывало в формате ip:port?

  import requests
from bs4 import BeautifulSoup as bs
# Fetch the listing page and write the text of each <td> to out.txt on its
# own line; every complete run of eight cells is followed by a blank line.
markup = requests.get('https://free-proxy-list.net/').text
tree = bs(markup, 'lxml')
all_cells = tree.find('div', {'class': 'table-responsive'}).find_all('td')
group_size = 8
with open('out.txt', 'w', encoding='utf-8') as sink:
    for start in range(0, len(all_cells), group_size):
        group = all_cells[start:start + group_size]
        for cell in group:
            print(cell.text, file=sink)
        if len(group) == group_size:
            # Only a full group gets the trailing separator line.
            print(file=sink)

Vigi

Авг. 22, 2018 13:02:00

 import requests
from bs4 import BeautifulSoup as bs
# Ask how many table rows to export, then write them to out.txt as ip:port.
limit = int(input(':>'))
page = requests.get('https://free-proxy-list.net/').text
rows = bs(page, 'lxml').find('tbody').find_all('tr')
with open('out.txt', 'w', encoding='utf-8') as out:
    for row in rows[:limit]:
        cells = row.find_all('td')
        # Cell 0 holds the address and cell 1 the port.
        print(cells[0].text, cells[1].text, sep=':', file=out)

Vigi

Авг. 22, 2018 13:49:22

Сорри, про https забыл, вот:

 import requests
from bs4 import BeautifulSoup as bs
# Write up to n https-capable proxies to out.txt, one ip:port per line.
n = int(input(':>'))
html = requests.get('https://free-proxy-list.net/').text
soup = bs(html, 'lxml')
res = soup.find('tbody').find_all('tr')
written = 0
with open('out.txt', 'w', encoding='utf-8') as f:
    # Walk the whole table and stop once n matching rows were written.
    # Slicing res[:n] before filtering would only inspect the first n rows
    # and so produce fewer than n https proxies.
    for i in res:
        if written >= n:
            break
        tds = i.find_all('td')
        # Column 6 is compared with 'yes' to select https proxies; rows with
        # fewer cells are skipped instead of raising IndexError.
        if len(tds) < 7 or tds[6].text.strip() != 'yes':
            continue
        print(tds[0].text.strip(), tds[1].text.strip(), sep=':', file=f)
        written += 1

sasholy

Авг. 22, 2018 14:08:04

Vigi

Куда вписывать число, которое мне надо (50–100 прокси ip)?
Не шарю в этом, учусь.

Vigi

Авг. 23, 2018 08:46:31

Тут:

sasholy

Авг. 29, 2018 13:48:17

Vigi
?

Как сделать, чтобы в этом коде работало через мой прокси-лист и юзер-агент? Если возможно, улучшить код. Куда и какой код вставлять с моими txt-списками? И как проверять и отбрасывать нерабочие прокси, чтобы работало без пропусков запросов (сразу подменяло рабочим прокси)?

 import requests
from bs4 import BeautifulSoup
from random import choice, uniform
from time import sleep
import csv
def get_html(url):
    """Download *url* with a plain GET request and return the body as text."""
    return requests.get(url).text
def write_csv(data):
    """Append one ``name, price`` row to ``cmc.csv`` in the current directory.

    Args:
        data: mapping that provides the keys ``'name'`` and ``'price'``.
    """
    # The csv module asks for newline='' so that its own '\r\n' row
    # terminator is not translated a second time (which shows up as blank
    # rows on Windows). The explicit encoding keeps non-ASCII names from
    # failing on platforms whose default codec is not UTF-8.
    with open('cmc.csv', 'a', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow([data['name'], data['price']])
def get_page_data(html):
    """Parse the ``currencies`` table in *html* and store each row via write_csv.

    For every ``<tr>`` of the table body the currency name is taken from the
    ``currency-name-container`` link in cell 1 and the price from the
    ``data-usd`` attribute of the link in cell 3.
    """
    document = BeautifulSoup(html, 'lxml')
    table = document.find('table', id='currencies')
    for row in table.find('tbody').find_all('tr'):
        cells = row.find_all('td')
        record = {}
        record['name'] = cells[1].find('a', class_='currency-name-container').text
        record['price'] = cells[3].find('a').get('data-usd')
        write_csv(record)
def main():
    """Fetch https://coinmarketcap.com/0 through /3 and save the parsed rows."""
    template = 'https://coinmarketcap.com/{}'
    for page_number in range(4):
        page_html = get_html(template.format(str(page_number)))
        get_page_data(page_html)
if __name__ == '__main__':
    main()

sasholy

Сен. 6, 2018 16:40:20

Что нужно сделать? Не работает — сайт блокирует?

 import requests
from bs4 import BeautifulSoup
from random import choice, uniform
from time import sleep
import csv
def get_html(url, useragent=None, proxy=None, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: address to fetch.
        useragent: optional dict of request headers, e.g.
            ``{'User-Agent': '...'}``.
        proxy: optional proxies mapping handed to ``requests.get``.
        timeout: seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: on connection errors or timeouts.
    """
    # requests applies no timeout by default, so an unresponsive proxy would
    # block the caller forever instead of letting it move on to the next one.
    r = requests.get(url, headers=useragent, proxies=proxy, timeout=timeout)
    return r.text
def write_csv(data):
    """Append a single row (name, price) to the file ``cmc.csv``.

    Args:
        data: mapping with the keys ``'name'`` and ``'price'``.
    """
    # newline='' is what the csv module expects of its file object;
    # otherwise the writer's '\r\n' terminator is translated again and every
    # row is followed by an empty one on Windows. UTF-8 is set explicitly so
    # that names outside the platform's default codec can still be written.
    with open('cmc.csv', 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow([data['name'], data['price']])
def get_page_data(html):
    """Extract name and USD price from the ``currencies`` table and save them.

    Each ``<tr>`` of the table body is turned into a
    ``{'name': ..., 'price': ...}`` dict and passed to ``write_csv``.

    Args:
        html: markup of the page to parse.
    """
    soup = BeautifulSoup(html, 'lxml')
    trs = soup.find('table', id='currencies').find('tbody').find_all('tr')
    for tr in trs:
        tds = tr.find_all('td')
        name = tds[1].find('a', class_='currency-name-container').text
        # Tag.get() looks up an HTML attribute. The link has no 'price'
        # attribute, so .get('price') always produced None; the USD value is
        # carried by 'data-usd'.
        # NOTE(review): assumes the coinmarketcap markup of the time - confirm.
        price = tds[3].find('a').get('data-usd')
        data = {'name': name,
                'price': price}
        write_csv(data)
def _read_lines(path):
    """Return the stripped, non-blank lines of the text file at *path*."""
    with open(path, encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]


def main():
    """Scrape coinmarketcap three times through random proxies and user agents.

    Reads ``useragents.txt`` and ``proxies.txt`` (one entry per line), waits
    3-6 seconds before each attempt, and skips the attempt when the request
    through the chosen proxy fails.

    Raises:
        ValueError: if either list file contains no entries.
    """
    url = 'https://coinmarketcap.com/'
    # Blank lines are dropped so that choice() can never return an empty
    # proxy or user agent (split('\n') kept the one after a final newline).
    useragents = _read_lines('useragents.txt')
    proxies = _read_lines('proxies.txt')
    if not useragents or not proxies:
        raise ValueError(
            'useragents.txt and proxies.txt must each contain at least one entry'
        )
    for i in range(3):
        sleep(uniform(3, 6))
        address = 'http://' + choice(proxies)
        # requests selects the proxy by the scheme of the target URL. The URL
        # is https, so a mapping with only an 'http' key is silently ignored
        # and the request goes out directly.
        proxy = {'http': address, 'https': address}
        useragent = {'User-Agent': choice(useragents)}
        try:
            html = get_html(url, useragent, proxy)
        except requests.RequestException:
            # Dead or refusing proxy: try again with another one.
            continue
        # Parse the page that was fetched through the proxy; fetching the
        # URL again here would bypass both the proxy and the user agent.
        get_page_data(html)


if __name__ == '__main__':
    main()