Instagram Grabber PHP
1. List top images by hashtag<?php
function scrape_insta_hash($tag) {
$insta_source = file_get_contents('https://www.instagram.com/explore/tags/'.$tag.'/'); // instagrame tag url
$shards = explode('window._sharedData = ', $insta_source);
$insta_json = explode(';</script>', $shards);
$insta_array = json_decode($insta_json, TRUE);
return $insta_array; // this return a lot things print it and see what else you need
}
$tag = 'visitvietnam'; // tag for which ou want images
$results_array = scrape_insta_hash($tag);
$limit = 10; // provide the limit thats important because one page only give some images then load more have to be clicked
$image_array= array(); // array to store images.
for ($i=0; $i < $limit; $i++) {
$latest_array = $results_array['entry_data']['TagPage']['tag']['media']['nodes'][$i];
$image_data= '<img src="'.$latest_array['thumbnail_src'].'">'; // thumbnail and same sizes
//$image_data= '<img src="'.$latest_array['display_src'].'">'; actual image and different sizes
array_push($image_array, $image_data);
}
foreach ($image_array as $image) {
echo $image;// this will echo the images wrap it in div or ul li what ever html structure
}
// for getting all images have to loop function for more pages
// for confirmationyou are getting correct images view
//https://www.instagram.com/explore/tags/your-tag-name/
?>
Bulk Downloader from any public instagram users.
1.Login the instagram, export the cookies for your login session.
2. Create a script called: instagram_dl.php
<?php
include("includes/rollingcurlx.class.php");
/**
* Crawls through user's page and downloads all avaliable images/videos
*/
function download($RCX, $username, $max_id = 0) {
$id = '';
$lastId = '';
if ($max_id > 0) {
$id = $max_id;
}
$userURL = "https://www.instagram.com/" . $username . "/media/?&max_id=" . $id;
$ch = curl_init();
$curl_options = array(
CURLOPT_URL => $userURL,
CURLOPT_REFERER => "https://www.instagram.com",
CURLOPT_USERAGENT => "User-Agent: Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 (.NET CLR 3.5.30729)",
CURLOPT_HEADER => 0,
CURLOPT_RETURNTRANSFER => true,
CURLOPT_TIMEOUT => 10,
CURLOPT_HTTPHEADER => array('Content-type: application/json'),
CURLOPT_COOKIEFILE => __DIR__ . "/cookies.txt",
CURLOPT_SSL_VERIFYPEER => false
);
curl_setopt_array($ch, $curl_options);
$response = curl_exec($ch);
if(empty($response)){
die("API returned nothing\r\n");
}
curl_close($ch);
$json = json_decode($response, true);
if($json['status'] == "ok" && !empty($json['items'])) {
// Loop over json, get the filename, URL and timestamp
foreach ($json['items'] as $data) {
if($data['type'] == "video") {
$imageURL = $data['videos']['standard_resolution']['url'];
$name = explode("/", $imageURL);
$name = $name;
} else {
$urlSplit = explode("/",
$data['images']['standard_resolution']['url']);
$name = $urlSplit;
// Some images have URLs of different lengths
// "/s1080x1080/" ensures the image is the largest possible
if(count($urlSplit) == 6) {
$imageURL = $urlSplit . "//" . $urlSplit
. "/" . $urlSplit . "/" . $urlSplit . "/"
. $urlSplit;
} elseif(count($urlSplit) == 7) {
$imageURL = $urlSplit . "//" . $urlSplit
. "/" . $urlSplit . "/s1080x1080/" . $urlSplit
. "/" . $urlSplit;
} elseif(count($urlSplit) == 8) {
$imageURL = $urlSplit . "//" . $urlSplit
. "/" . $urlSplit . "/" . $urlSplit
. "/s1080x1080/" . $urlSplit . "/" . $urlSplit;
} elseif(count($urlSplit) == 9) {
$imageURL = $urlSplit . "//" . $urlSplit
. "/" . $urlSplit . "/" . $urlSplit
. "/s1080x1080/" . $urlSplit . "/" . $urlSplit
. "/" . $urlSplit;
} else {
$imageURL = $data['images']['standard_resolution']['url'];
}
}
// Add image to download queue
$RCX->addRequest($imageURL, null, 'save', ['fileName' => $name, 'created_time' => $data['created_time'], 'username' => $username]);
// Instagram only shows one page of images at a given time, saves the id of the last image
$lastId = $data['id'];
}
} else {
die("Invalid username or private account.\r\n");
}
// Recurse if more images are avaliable
if($json['more_available'] == true){
return download($RCX, $username, $lastId);
} else {
$RCX->setOptions([array(
CURLOPT_REFERER => "http://instagram.com",
CURLOPT_USERAGENT => "User-Agent: Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 (.NET CLR 3.5.30729)",
CURLOPT_HEADER => 0,
CURLOPT_RETURNTRANSFER => true,
CURLOPT_TIMEOUT => 10,
)]);
$RCX->execute();
}
}
function save($response, $url, $request_info, $user_data, $time) {
$saveto = "./" . $user_data['username'] . "/";
// Create user's folder
if(!file_exists($saveto)) {
if (!mkdir($saveto, 0744, true)) {
die(date("Y-m-d H:i:s") . " - Failed to create folder.\r\n");
}
}
$fileName = $user_data['fileName'];
$timestamp = $user_data['created_time'];
// Instagram API sometimes gives weird file names
if(strpos($fileName, "ig_cache_key")) {
$fileName = explode("?", $fileName);
}
$fileLocation = $saveto . $timestamp . "_" . $fileName;
if(!file_exists($fileLocation)) {
// Check error code
if($request_info['http_code'] == "200") {
echo("[" . $request_info['http_code'] . "] " .date("Y-m-d H:i:s") . " - saved " . $fileName . "\r\n");
} else {
echo("[" . $request_info['http_code'] . "] " .date("Y-m-d H:i:s") . " - Error downloading " . $fileName . " @ " . $url . "\r\n");
return;
}
$fp = fopen($fileLocation, 'w');
fwrite($fp, $response);
fclose($fp);
}
}
if(!isset($argv) || empty($argv)) {
die("Usage: php " . $_SERVER["SCRIPT_FILENAME"] . " <username>\r\n");
}
if (!function_exists('curl_init')) {
die(date("Y-m-d H:i:s") . " - cURL is not installed.\r\n");
}
download(new RollingCurlX(10), $argv, 0);
?>
3. Create a scripted called:rollingcurlx.class.php put into the includes/ folder
<?php
/*
---------- RollingCurlX 1.0.0 -----------
an easy to use curl_multi wrapper for php
Copyright (c) 2015 Marcus Leath
License: MIT
https://github.com/marcushat/RollingCurlX
*/
Class RollingCurlX {
private $_maxConcurrent = 0; //max. number of simultaneous connections allowed
private $_options = []; //shared cURL options
private $_headers = []; //shared cURL request headers
private $_callback = NULL; //default callback
private $_timeout = 5000; //all requests must be completed by this time
public $requests = []; //request_queue
function __construct($max_concurrent = 10) {
$this->setMaxConcurrent($max_concurrent);
}
public function setMaxConcurrent($max_requests) {
if($max_requests > 0) {
$this->_maxConcurrent = $max_requests;
}
}
public function setOptions(array $options) {
$this->_options = $options;
}
public function setHeaders(array $headers) {
if(is_array($headers) && count($headers)) {
$this->_headers = $headers;
}
}
public function setCallback(callable $callback) {
$this->_callback = $callback;
}
public function setTimeout($timeout) { //in milliseconds
if($timeout > 0) {
$this->_timeout = $timeout/1000; //to seconds
}
}
//Add a request to the request queue
public function addRequest(
$url,
$post_data = NULL,
callable $callback = NULL, //individual callback
$user_data = NULL,
array $options = NULL, //individual cURL options
array $headers = NULL //individual cURL request headers
) { //Add to request queue
$this->requests[] = [
'url' => $url,
'post_data' => ($post_data) ? $post_data : NULL,
'callback' => ($callback) ? $callback : $this->_callback,
'user_data' => ($user_data) ? $user_data : NULL,
'options' => ($options) ? $options : NULL,
'headers' => ($headers) ? $headers : NULL
];
return count($this->requests) - 1; //return request number/index
}
//Reset request queue
public function reset() {
$this->requests = [];
}
private function normalize_headers(array $headers) {
$normalized = [];
foreach($headers as $key => $header) {
if(is_string($key)) {
$normal = "$key: $header";
} else {
$header;
}
$normalized = [];
}
}
//Execute the request queue
public function execute() {
if(count($this->requests) < $this->_maxConcurrent) {
$this->_maxConcurrent = count($this->requests);
}
//the request map that maps the request queue to request curl handles
$requests_map = [];
$multi_handle = curl_multi_init();
//start processing the initial request queue
for($i = 0; $i < $this->_maxConcurrent; $i++) {
$this->init_request($i, $multi_handle, $requests_map);
}
do{
do{
$mh_status = curl_multi_exec($multi_handle, $active);
} while($mh_status == CURLM_CALL_MULTI_PERFORM);
if($mh_status != CURLM_OK) {
break;
}
//a request is just completed, find out which one
while($completed = curl_multi_info_read($multi_handle)) {
$this->process_request($completed, $multi_handle, $requests_map);
//add/start a new request to the request queue
if($i < count($this->requests) && isset($this->requests[$i])) { //if requests left
$this->init_request($i, $multi_handle, $requests_map);
$i++;
}
}
usleep(15); //save CPU cycles, prevent continuous checking
} while ($active || count($requests_map)); //End do-while
$this->reset();
curl_multi_close($multi_handle);
}
//Build individual cURL options for a request
private function buildOptions(array $request) {
$url = $request['url'];
$post_data = $request['post_data'];
$individual_opts = $request['options'];
$individual_headers = $request['headers'];
$options = ($individual_opts) ? $individual_opts + $this->_options : $this->_options; //merge shared and individual request options
$headers = ($individual_headers) ? $individual_headers + $this->_headers : $this->_headers; //merge shared and individual request headers
//the below will overide the corresponding default or individual options
$options = true;
$options = 1;
$options = max(1, $this->_timeout/1000); //minimum of 1 second
$options = $this->_timeout/1000;
if($url) {
$options = $url;
}
if($headers) {
$options = $headers;
}
// enable POST method and set POST parameters
if($post_data) {
$options = 1;
$options = is_array($post_data)? http_build_query($post_data) : $post_data;
}
return $options;
}
private function init_request($request_num, $multi_handle, &$requests_map) {
$request =& $this->requests[$request_num];
$this->addTimer($request);
$ch = curl_init();
$opts_set = curl_setopt_array($ch, $this->buildOptions($request));
if(!$opts_set) {
echo 'options not set';
exit;
}
curl_multi_add_handle($multi_handle, $ch);
//add curl handle of a new request to the request map
$ch_hash = (string) $ch;
$requests_map[$ch_hash] = $request_num;
}
private function process_request($completed, $multi_handle, array &$requests_map) {
$ch = $completed['handle'];
$ch_hash = (string) $ch;
$request =& $this->requests[$requests_map[$ch_hash]]; //map handler to request index to get request info
$request_info = curl_getinfo($ch);
$request_info['curle'] = $completed['result'];
$request_info['curle_msg'] = $this->curle_msgs[$completed['result']];
$request_info['handle'] = $ch;
$request_info['time'] = $time = $this->stopTimer($request); //record request time
$request_info['url_raw'] = $url = $request['url'];
$request_info['user_data'] = $user_data = $request['user_data'];
if(curl_errno($ch) !== 0 || intval($request_info['http_code']) !== 200) { //if server responded with http error
$response = false;
} else { //sucessful response
$response = curl_multi_getcontent($ch);
}
//get request info
$callback = $request['callback'];
$options = $request['options'];
if($response && (isset($this->_options) || isset($options))) {
$k = intval($request_info['header_size']);
$request_info['response_header'] = substr($response, 0, $k);
$response = substr($response, $k);
}
//remove completed request and its curl handle
unset($requests_map[$ch_hash]);
curl_multi_remove_handle($multi_handle, $ch);
//call the callback function and pass request info and user data to it
if($callback) {
call_user_func($callback, $response, $url, $request_info, $user_data, $time);
}
$request = NULL; //free up memory now just incase response was large
}
private function check_for_timeouts($mh) {
$now = microtime($true);
$request_map = $this->_request_map;
$requests = $this->_request_map;
foreach($request_maps as $ch_hash => $request_num) {
$request = $requests[$request_num];
$timeout = $request->timeout;
$start_time = $request->start_time;
$ch = $request->handle;
if($now >=$start_time + $timeout) {
curl_multi_remove_handle($mh, $ch);
}
}
}
private function addTimer(array &$request) { //adds timer object to request
$request['timer'] = microtime(true);
$request['time'] = false; //default if not overridden by time later
}
private function stopTimer(array &$request) {
$elapsed = $request['time'] = microtime(true) - $request['timer'];
unset($request['timer']);
return $elapsed;
}
private $curle_msgs = [CURLE_OK => 'OK', CURLE_UNSUPPORTED_PROTOCOL => 'UNSUPPORTED_PROTOCOL', CURLE_FAILED_INIT => 'FAILED_INIT', CURLE_URL_MALFORMAT => 'URL_MALFORMAT', CURLE_URL_MALFORMAT_USER => 'URL_MALFORMAT_USER', CURLE_COULDNT_RESOLVE_PROXY => 'COULDNT_RESOLVE_PROXY', CURLE_COULDNT_RESOLVE_HOST => 'COULDNT_RESOLVE_HOST', CURLE_COULDNT_CONNECT => 'COULDNT_CONNECT', CURLE_FTP_WEIRD_SERVER_REPLY => 'FTP_WEIRD_SERVER_REPLY', CURLE_FTP_ACCESS_DENIED => 'FTP_ACCESS_DENIED', CURLE_FTP_USER_PASSWORD_INCORRECT => 'FTP_USER_PASSWORD_INCORRECT', CURLE_FTP_WEIRD_PASS_REPLY => 'FTP_WEIRD_PASS_REPLY', CURLE_FTP_WEIRD_USER_REPLY => 'FTP_WEIRD_USER_REPLY', CURLE_FTP_WEIRD_PASV_REPLY => 'FTP_WEIRD_PASV_REPLY', CURLE_FTP_WEIRD_227_FORMAT => 'FTP_WEIRD_227_FORMAT', CURLE_FTP_CANT_GET_HOST => 'FTP_CANT_GET_HOST', CURLE_FTP_CANT_RECONNECT => 'FTP_CANT_RECONNECT', CURLE_FTP_COULDNT_SET_BINARY => 'FTP_COULDNT_SET_BINARY', CURLE_PARTIAL_FILE => 'PARTIAL_FILE', CURLE_FTP_COULDNT_RETR_FILE => 'FTP_COULDNT_RETR_FILE', CURLE_FTP_WRITE_ERROR => 'FTP_WRITE_ERROR', CURLE_FTP_QUOTE_ERROR => 'FTP_QUOTE_ERROR', CURLE_HTTP_NOT_FOUND => 'HTTP_NOT_FOUND', CURLE_WRITE_ERROR => 'WRITE_ERROR', CURLE_MALFORMAT_USER => 'MALFORMAT_USER', CURLE_FTP_COULDNT_STOR_FILE => 'FTP_COULDNT_STOR_FILE', CURLE_READ_ERROR => 'READ_ERROR', CURLE_OUT_OF_MEMORY => 'OUT_OF_MEMORY', CURLE_OPERATION_TIMEOUTED => 'OPERATION_TIMEOUTED', CURLE_FTP_COULDNT_SET_ASCII => 'FTP_COULDNT_SET_ASCII', CURLE_FTP_PORT_FAILED => 'FTP_PORT_FAILED', CURLE_FTP_COULDNT_USE_REST => 'FTP_COULDNT_USE_REST', CURLE_FTP_COULDNT_GET_SIZE => 'FTP_COULDNT_GET_SIZE', CURLE_HTTP_RANGE_ERROR => 'HTTP_RANGE_ERROR', CURLE_HTTP_POST_ERROR => 'HTTP_POST_ERROR', CURLE_SSL_CONNECT_ERROR => 'SSL_CONNECT_ERROR', CURLE_FTP_BAD_DOWNLOAD_RESUME => 'FTP_BAD_DOWNLOAD_RESUME', CURLE_FILE_COULDNT_READ_FILE => 'FILE_COULDNT_READ_FILE', CURLE_LDAP_CANNOT_BIND => 'LDAP_CANNOT_BIND', CURLE_LDAP_SEARCH_FAILED => 'LDAP_SEARCH_FAILED', CURLE_LIBRARY_NOT_FOUND => 'LIBRARY_NOT_FOUND', CURLE_FUNCTION_NOT_FOUND => 'FUNCTION_NOT_FOUND', CURLE_ABORTED_BY_CALLBACK => 'ABORTED_BY_CALLBACK', CURLE_BAD_FUNCTION_ARGUMENT => 'BAD_FUNCTION_ARGUMENT', CURLE_BAD_CALLING_ORDER => 'BAD_CALLING_ORDER', CURLE_HTTP_PORT_FAILED => 'HTTP_PORT_FAILED', CURLE_BAD_PASSWORD_ENTERED => 'BAD_PASSWORD_ENTERED', CURLE_TOO_MANY_REDIRECTS => 'TOO_MANY_REDIRECTS', CURLE_UNKNOWN_TELNET_OPTION => 'UNKNOWN_TELNET_OPTION', CURLE_TELNET_OPTION_SYNTAX => 'TELNET_OPTION_SYNTAX', CURLE_OBSOLETE => 'OBSOLETE', CURLE_SSL_PEER_CERTIFICATE => 'SSL_PEER_CERTIFICATE', CURLE_GOT_NOTHING => 'GOT_NOTHING', CURLE_SSL_ENGINE_NOTFOUND => 'SSL_ENGINE_NOTFOUND', CURLE_SSL_ENGINE_SETFAILED => 'SSL_ENGINE_SETFAILED', CURLE_SEND_ERROR => 'SEND_ERROR', CURLE_RECV_ERROR => 'RECV_ERROR', CURLE_SHARE_IN_USE => 'SHARE_IN_USE', CURLE_SSL_CERTPROBLEM => 'SSL_CERTPROBLEM', CURLE_SSL_CIPHER => 'SSL_CIPHER', CURLE_SSL_CACERT => 'SSL_CACERT', CURLE_BAD_CONTENT_ENCODING => 'BAD_CONTENT_ENCODING', CURLE_LDAP_INVALID_URL => 'LDAP_INVALID_URL', CURLE_FILESIZE_EXCEEDED => 'FILESIZE_EXCEEDED', CURLE_FTP_SSL_FAILED => 'FTP_SSL_FAILED', CURLE_SSH => 'SSH'
];
}
?>
4. Upload to PHP supported hosting server.
Download with command:
php instagram_dl.php <username>
5. Backup your instagram photos to the cloud. Trích xuất toàn bộ thông tin: userid, captions, likes của 1 tài khoản instagram
Extract all information of an account via:
https://www.instagram.com/instagram/?__a=1.
explore via tag
https://www.instagram.com/explore/tags/vietnam/?__a=1&max_id=600
via location
https://www.instagram.com/explore/locations/4021454/?__a=1
Dữ liệu trả về dạng JSON. Công việc kế tiếp là tìm công cụ chuyển từ json sang excel hay bất kỳ loại dữ liệu thô nào bạn muốn
Trang:
[1]