uri_split.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494
  1. /* <!-- copyright */
  2. /*
  3. * aria2 - The high speed download utility
  4. *
  5. * Copyright (C) 2012 Tatsuhiro Tsujikawa
  6. *
  7. * This program is free software; you can redistribute it and/or modify
  8. * it under the terms of the GNU General Public License as published by
  9. * the Free Software Foundation; either version 2 of the License, or
  10. * (at your option) any later version.
  11. *
  12. * This program is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. * GNU General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU General Public License
  18. * along with this program; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. *
  21. * In addition, as a special exception, the copyright holders give
  22. * permission to link the code of portions of this program with the
  23. * OpenSSL library under certain conditions as described in each
  24. * individual source file, and distribute linked combinations
  25. * including the two.
  26. * You must obey the GNU General Public License in all respects
  27. * for all of the code used other than OpenSSL. If you modify
  28. * file(s) with this exception, you may extend this exception to your
  29. * version of the file(s), but you are not obligated to do so. If you
  30. * do not wish to do so, delete this exception statement from your
  31. * version. If you delete this exception statement from all source
  32. * files in the program, then also delete it here.
  33. */
  34. /* copyright --> */
  35. #include "uri_split.h"
  36. #include <stdlib.h>
  /*
   * States of the hand-written scanner in uri_split().  A BEFORE_xxx
   * state means the delimiter introducing xxx has just been consumed
   * and the next character is the first one of xxx.  The MAYBE_xxx
   * states exist because "a:b" after "//" is ambiguous until a later
   * '@' (user:passwd) or '/', '?', '#', end of input (host:port) is
   * seen.
   */
  typedef enum {
    URI_BEFORE_SCHEME,       /* nothing consumed yet */
    URI_SCHEME,              /* inside the scheme, waiting for ':' */
    URI_SCHEME_SLASH1,       /* ':' seen, first '/' is required */
    URI_SCHEME_SLASH2,       /* first '/' seen, second '/' is required */
    URI_BEFORE_MAYBE_USER,   /* "//" seen; user, host or '[' follows */
    URI_MAYBE_USER,          /* inside a token that is user or host */
    URI_BEFORE_MAYBE_PASSWD, /* ':' seen; password or port follows */
    URI_MAYBE_PASSWD,        /* inside a token that is password or port */
    URI_BEFORE_HOST,         /* '@' seen after a password */
    URI_HOST,                /* inside a host known not to be userinfo */
    URI_BEFORE_IPV6HOST,     /* '[' seen */
    URI_IPV6HOST,            /* inside '[' ... ']', waiting for ']' */
    URI_AFTER_IPV6HOST,      /* ']' seen */
    URI_BEFORE_PORT,         /* ':' seen after the host; digit required */
    URI_PORT,                /* inside the port digits */
    URI_PATH,                /* inside the path */
    URI_BEFORE_QUERY,        /* '?' seen */
    URI_QUERY,               /* inside the query */
    URI_BEFORE_FRAGMENT,     /* '#' seen */
    URI_FRAGMENT             /* inside the fragment; runs to end of input */
  } uri_split_state;
  59. static void uri_set_field(uri_split_result* res, int field, const char* first,
  60. const char* last, const char* uri)
  61. {
  62. if (first) {
  63. res->field_set |= 1 << field;
  64. res->fields[field].off = first - uri;
  65. res->fields[field].len = last - first;
  66. }
  67. }
  /*
   * Returns 1 if |c| is one of the decimal digits '0' through '9' and
   * 0 otherwise.
   */
  static int is_digit(char c)
  {
    return c >= '0' && c <= '9';
  }
  69. int uri_split(uri_split_result* res, const char* uri)
  70. {
  71. int state = URI_BEFORE_SCHEME;
  72. const char* scheme_first = NULL, * scheme_last = NULL, * host_first = NULL,
  73. * host_last = NULL, * path_first = NULL, * path_last = NULL,
  74. * query_first = NULL, * query_last = NULL,
  75. * fragment_first = NULL, * fragment_last = NULL,
  76. * user_first = NULL, * user_last = NULL, * passwd_first = NULL,
  77. * passwd_last = NULL, * last_atmark = NULL, * last_slash = NULL,
  78. * p = uri;
  79. int32_t port = -1;
  80. uint8_t flags = 0;
  81. for (; *p; ++p) {
  82. switch (state) {
  83. case URI_BEFORE_SCHEME:
  84. scheme_first = p;
  85. state = URI_SCHEME;
  86. break;
  87. case URI_SCHEME:
  88. if (*p == ':') {
  89. scheme_last = p;
  90. state = URI_SCHEME_SLASH1;
  91. }
  92. break;
  93. case URI_SCHEME_SLASH1:
  94. if (*p == '/') {
  95. state = URI_SCHEME_SLASH2;
  96. }
  97. else {
  98. return -1;
  99. }
  100. break;
  101. case URI_SCHEME_SLASH2:
  102. if (*p == '/') {
  103. state = URI_BEFORE_MAYBE_USER;
  104. }
  105. else {
  106. return -1;
  107. }
  108. break;
  109. case URI_BEFORE_MAYBE_USER:
  110. switch (*p) {
  111. case '@':
  112. case ':':
  113. case '/':
  114. return -1;
  115. case '[':
  116. state = URI_BEFORE_IPV6HOST;
  117. break;
  118. default:
  119. user_first = p;
  120. state = URI_MAYBE_USER;
  121. }
  122. break;
  123. case URI_MAYBE_USER:
  124. switch (*p) {
  125. case '@':
  126. last_atmark = p;
  127. break;
  128. case ':':
  129. user_last = p;
  130. state = URI_BEFORE_MAYBE_PASSWD;
  131. break;
  132. case '[':
  133. if (last_atmark == p - 1) {
  134. user_last = last_atmark;
  135. state = URI_BEFORE_IPV6HOST;
  136. }
  137. else {
  138. return -1;
  139. }
  140. break;
  141. case '/':
  142. case '?':
  143. case '#':
  144. /* It turns out that this is only host or user + host if
  145. last_atmark is not NULL. */
  146. if (last_atmark) {
  147. host_first = last_atmark + 1;
  148. host_last = p;
  149. user_last = last_atmark;
  150. }
  151. else {
  152. host_first = user_first;
  153. host_last = p;
  154. user_first = user_last = NULL;
  155. }
  156. switch (*p) {
  157. case '/':
  158. path_first = last_slash = p;
  159. state = URI_PATH;
  160. break;
  161. case '?':
  162. state = URI_BEFORE_QUERY;
  163. break;
  164. case '#':
  165. state = URI_BEFORE_FRAGMENT;
  166. break;
  167. }
  168. break;
  169. }
  170. break;
  171. case URI_BEFORE_MAYBE_PASSWD:
  172. passwd_first = p;
  173. switch (*p) {
  174. case '@':
  175. passwd_last = last_atmark = p;
  176. state = URI_BEFORE_HOST;
  177. break;
  178. case '/':
  179. return -1;
  180. default:
  181. /* sums up port number in case of port. */
  182. if (is_digit(*p)) {
  183. port = *p - '0';
  184. }
  185. state = URI_MAYBE_PASSWD;
  186. }
  187. break;
  188. case URI_MAYBE_PASSWD:
  189. switch (*p) {
  190. case '@':
  191. passwd_last = last_atmark = p;
  192. /* Passwd confirmed, reset port to -1. */
  193. port = -1;
  194. state = URI_BEFORE_HOST;
  195. break;
  196. case '[':
  197. return -1;
  198. case '/':
  199. case '?':
  200. case '#':
  201. /* This is port not password. port is in [passwd_first, p) */
  202. if (port == -1) {
  203. return -1;
  204. }
  205. if (last_atmark) {
  206. host_first = last_atmark + 1;
  207. host_last = passwd_first - 1;
  208. user_last = last_atmark;
  209. }
  210. else {
  211. host_first = user_first;
  212. host_last = passwd_first - 1;
  213. user_first = user_last = NULL;
  214. }
  215. passwd_first = passwd_last = NULL;
  216. switch (*p) {
  217. case '/':
  218. path_first = last_slash = p;
  219. state = URI_PATH;
  220. break;
  221. case '?':
  222. state = URI_BEFORE_QUERY;
  223. break;
  224. case '#':
  225. state = URI_BEFORE_FRAGMENT;
  226. break;
  227. }
  228. break;
  229. default:
  230. if (port != -1) {
  231. if (is_digit(*p)) {
  232. port *= 10;
  233. port += *p - '0';
  234. if (port > UINT16_MAX) {
  235. port = -1;
  236. }
  237. }
  238. else {
  239. port = -1;
  240. }
  241. }
  242. break;
  243. }
  244. break;
  245. case URI_BEFORE_HOST:
  246. switch (*p) {
  247. case ':':
  248. case '/':
  249. return -1;
  250. case '[':
  251. state = URI_BEFORE_IPV6HOST;
  252. break;
  253. default:
  254. host_first = p;
  255. state = URI_HOST;
  256. break;
  257. }
  258. break;
  259. case URI_HOST:
  260. switch (*p) {
  261. case ':':
  262. host_last = p;
  263. state = URI_BEFORE_PORT;
  264. break;
  265. case '/':
  266. host_last = path_first = last_slash = p;
  267. state = URI_PATH;
  268. break;
  269. case '?':
  270. host_last = p;
  271. state = URI_BEFORE_QUERY;
  272. break;
  273. case '#':
  274. host_last = p;
  275. state = URI_BEFORE_FRAGMENT;
  276. break;
  277. }
  278. break;
  279. case URI_BEFORE_IPV6HOST:
  280. if (*p == ']') {
  281. return -1;
  282. }
  283. host_first = p;
  284. state = URI_IPV6HOST;
  285. break;
  286. case URI_IPV6HOST:
  287. if (*p == ']') {
  288. flags |= USF_IPV6ADDR;
  289. host_last = p;
  290. state = URI_AFTER_IPV6HOST;
  291. }
  292. break;
  293. case URI_AFTER_IPV6HOST:
  294. switch (*p) {
  295. case ':':
  296. state = URI_BEFORE_PORT;
  297. break;
  298. case '/':
  299. path_first = last_slash = p;
  300. state = URI_PATH;
  301. break;
  302. case '?':
  303. state = URI_BEFORE_QUERY;
  304. break;
  305. case '#':
  306. state = URI_BEFORE_FRAGMENT;
  307. break;
  308. default:
  309. return -1;
  310. }
  311. break;
  312. case URI_BEFORE_PORT:
  313. if (is_digit(*p)) {
  314. port = *p - '0';
  315. state = URI_PORT;
  316. }
  317. else {
  318. return -1;
  319. }
  320. break;
  321. case URI_PORT:
  322. switch (*p) {
  323. case '/':
  324. path_first = last_slash = p;
  325. state = URI_PATH;
  326. break;
  327. case '?':
  328. state = URI_BEFORE_QUERY;
  329. break;
  330. case '#':
  331. state = URI_BEFORE_FRAGMENT;
  332. break;
  333. default:
  334. if (is_digit(*p)) {
  335. port *= 10;
  336. port += *p - '0';
  337. if (port > UINT16_MAX) {
  338. return -1;
  339. }
  340. }
  341. else {
  342. return -1;
  343. }
  344. }
  345. break;
  346. case URI_PATH:
  347. switch (*p) {
  348. case '/':
  349. last_slash = p;
  350. break;
  351. case '?':
  352. path_last = p;
  353. state = URI_BEFORE_QUERY;
  354. break;
  355. case '#':
  356. path_last = p;
  357. state = URI_BEFORE_FRAGMENT;
  358. break;
  359. }
  360. break;
  361. case URI_BEFORE_QUERY:
  362. query_first = p;
  363. if (*p == '#') {
  364. query_last = p;
  365. state = URI_BEFORE_FRAGMENT;
  366. }
  367. else {
  368. state = URI_QUERY;
  369. }
  370. break;
  371. case URI_QUERY:
  372. if (*p == '#') {
  373. query_last = p;
  374. state = URI_BEFORE_FRAGMENT;
  375. }
  376. break;
  377. case URI_BEFORE_FRAGMENT:
  378. fragment_first = p;
  379. state = URI_FRAGMENT;
  380. break;
  381. case URI_FRAGMENT:
  382. break;
  383. }
  384. }
  385. /* Handle premature states */
  386. switch (state) {
  387. case URI_BEFORE_SCHEME:
  388. case URI_SCHEME:
  389. case URI_SCHEME_SLASH1:
  390. case URI_SCHEME_SLASH2:
  391. return -1;
  392. case URI_BEFORE_MAYBE_USER:
  393. return -1;
  394. case URI_MAYBE_USER:
  395. if (last_atmark) {
  396. host_first = last_atmark + 1;
  397. host_last = p;
  398. if (host_first == host_last) {
  399. return -1;
  400. }
  401. user_last = last_atmark;
  402. }
  403. else {
  404. host_first = user_first;
  405. host_last = p;
  406. user_first = user_last = NULL;
  407. }
  408. break;
  409. case URI_BEFORE_MAYBE_PASSWD:
  410. return -1;
  411. case URI_MAYBE_PASSWD:
  412. if (port == -1) {
  413. return -1;
  414. }
  415. if (last_atmark) {
  416. host_first = last_atmark + 1;
  417. host_last = passwd_first - 1;
  418. user_last = last_atmark;
  419. }
  420. else {
  421. host_first = user_first;
  422. host_last = passwd_first - 1;
  423. user_first = user_last = NULL;
  424. }
  425. passwd_first = passwd_last = NULL;
  426. break;
  427. case URI_BEFORE_HOST:
  428. return -1;
  429. case URI_HOST:
  430. host_last = p;
  431. break;
  432. case URI_BEFORE_IPV6HOST:
  433. case URI_IPV6HOST:
  434. return -1;
  435. case URI_AFTER_IPV6HOST:
  436. break;
  437. case URI_BEFORE_PORT:
  438. return -1;
  439. case URI_PORT:
  440. if (port == -1) {
  441. return -1;
  442. }
  443. break;
  444. case URI_PATH:
  445. path_last = p;
  446. break;
  447. case URI_BEFORE_QUERY:
  448. query_first = query_last = p;
  449. break;
  450. case URI_QUERY:
  451. query_last = p;
  452. break;
  453. case URI_BEFORE_FRAGMENT:
  454. fragment_first = fragment_last = p;
  455. break;
  456. case URI_FRAGMENT:
  457. fragment_last = p;
  458. break;
  459. default:
  460. return -1;
  461. };
  462. if (res) {
  463. res->field_set = 0;
  464. res->port = 0;
  465. res->flags = flags;
  466. uri_set_field(res, USR_SCHEME, scheme_first, scheme_last, uri);
  467. uri_set_field(res, USR_HOST, host_first, host_last, uri);
  468. uri_set_field(res, USR_PATH, path_first, path_last, uri);
  469. uri_set_field(res, USR_QUERY, query_first, query_last, uri);
  470. uri_set_field(res, USR_FRAGMENT, fragment_first, fragment_last, uri);
  471. uri_set_field(res, USR_USER, user_first, user_last, uri);
  472. uri_set_field(res, USR_PASSWD, passwd_first, passwd_last, uri);
  473. if (res->field_set & (1 << USR_USER)) {
  474. uri_set_field(res, USR_USERINFO, user_first, last_atmark, uri);
  475. }
  476. if (last_slash && last_slash + 1 != path_last) {
  477. uri_set_field(res, USR_BASENAME, last_slash + 1, path_last, uri);
  478. }
  479. if (port != -1) {
  480. res->field_set |= 1 << USR_PORT;
  481. res->port = port;
  482. }
  483. }
  484. return 0;
  485. }