xref: /haiku/src/add-ons/kernel/network/stack/net_socket.cpp (revision cfc3fa87da824bdf593eb8b817a83b6376e77935)
1 /*
2  * Copyright 2006-2008, Haiku, Inc. All Rights Reserved.
3  * Distributed under the terms of the MIT License.
4  *
5  * Authors:
6  *		Axel Dörfler, axeld@pinc-software.de
7  */
8 
9 
10 #include "stack_private.h"
11 
12 #include <net_protocol.h>
13 #include <net_stack.h>
14 #include <net_stat.h>
15 
16 #include <KernelExport.h>
17 #include <Select.h>
18 #include <team.h>
19 #include <util/AutoLock.h>
20 #include <util/list.h>
21 #include <fs/select_sync_pool.h>
22 
23 #include <new>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <sys/time.h>
27 
28 
29 struct net_socket_private : net_socket {
30 	struct list_link		link;
31 	team_id					owner;
32 	uint32					max_backlog;
33 	uint32					child_count;
34 	struct list				pending_children;
35 	struct list				connected_children;
36 
37 	struct select_sync_pool	*select_pool;
38 	benaphore				lock;
39 };
40 
41 
42 void socket_delete(net_socket *socket);
43 int socket_bind(net_socket *socket, const struct sockaddr *address,
44 	socklen_t addressLength);
45 
46 
47 struct list sSocketList;
48 benaphore sSocketLock;
49 
50 
51 static size_t
52 compute_user_iovec_length(iovec *userVec, uint32 count)
53 {
54 	size_t length = 0;
55 
56 	for (uint32 i = 0; i < count; i++) {
57 		iovec vec;
58 		if (user_memcpy(&vec, userVec + i, sizeof(iovec)) < B_OK)
59 			return 0;
60 
61 		length += vec.iov_len;
62 	}
63 
64 	return length;
65 }
66 
67 
68 static void
69 delete_children(struct list *list)
70 {
71 	while (true) {
72 		net_socket_private *child
73 			= (net_socket_private *)list_remove_head_item(list);
74 		if (child == NULL)
75 			break;
76 
77 		child->parent = NULL;
78 		socket_delete(child);
79 	}
80 }
81 
82 
83 static status_t
84 create_socket(int family, int type, int protocol, net_socket_private **_socket)
85 {
86 	struct net_socket_private *socket = new (std::nothrow) net_socket_private;
87 	if (socket == NULL)
88 		return B_NO_MEMORY;
89 
90 	memset(socket, 0, sizeof(net_socket_private));
91 	socket->family = family;
92 	socket->type = type;
93 	socket->protocol = protocol;
94 
95 	status_t status = benaphore_init(&socket->lock, "socket");
96 	if (status < B_OK)
97 		goto err1;
98 
99 	// set defaults (may be overridden by the protocols)
100 	socket->send.buffer_size = 65535;
101 	socket->send.low_water_mark = 1;
102 	socket->send.timeout = B_INFINITE_TIMEOUT;
103 	socket->receive.buffer_size = 65535;
104 	socket->receive.low_water_mark = 1;
105 	socket->receive.timeout = B_INFINITE_TIMEOUT;
106 
107 	list_init_etc(&socket->pending_children, offsetof(net_socket_private,
108 		link));
109 	list_init_etc(&socket->connected_children, offsetof(net_socket_private,
110 		link));
111 
112 	status = get_domain_protocols(socket);
113 	if (status < B_OK)
114 		goto err2;
115 
116 	*_socket = socket;
117 	return B_OK;
118 
119 err2:
120 	benaphore_destroy(&socket->lock);
121 err1:
122 	delete socket;
123 	return status;
124 }
125 
126 
127 //	#pragma mark -
128 
129 
130 status_t
131 socket_open(int family, int type, int protocol, net_socket **_socket)
132 {
133 	net_socket_private *socket;
134 	status_t status = create_socket(family, type, protocol, &socket);
135 	if (status < B_OK)
136 		return status;
137 
138 	status = socket->first_info->open(socket->first_protocol);
139 	if (status < B_OK) {
140 		socket_delete(socket);
141 		return status;
142 	}
143 
144 	socket->owner = team_get_current_team_id();
145 
146 	benaphore_lock(&sSocketLock);
147 	list_add_item(&sSocketList, socket);
148 	benaphore_unlock(&sSocketLock);
149 
150 	*_socket = socket;
151 	return B_OK;
152 }
153 
154 
155 status_t
156 socket_close(net_socket *_socket)
157 {
158 	net_socket_private *socket = (net_socket_private *)_socket;
159 
160 	if (socket->select_pool) {
161 		// notify all pending selects
162 		notify_select_event_pool(socket->select_pool, ~0);
163 	}
164 
165 	return socket->first_info->close(socket->first_protocol);
166 }
167 
168 
169 status_t
170 socket_free(net_socket *socket)
171 {
172 	status_t status = socket->first_info->free(socket->first_protocol);
173 	if (status == B_BUSY)
174 		return B_OK;
175 
176 	socket_delete(socket);
177 	return B_OK;
178 }
179 
180 
181 status_t
182 socket_readv(net_socket *socket, const iovec *vecs, size_t vecCount,
183 	size_t *_length)
184 {
185 	return -1;
186 }
187 
188 
189 status_t
190 socket_writev(net_socket *socket, const iovec *vecs, size_t vecCount,
191 	size_t *_length)
192 {
193 	if (socket->peer.ss_len == 0)
194 		return ECONNRESET;
195 
196 	if (socket->address.ss_len == 0) {
197 		// try to bind first
198 		status_t status = socket_bind(socket, NULL, 0);
199 		if (status < B_OK)
200 			return status;
201 	}
202 
203 	// TODO: useful, maybe even computed header space!
204 	net_buffer *buffer = gNetBufferModule.create(256);
205 	if (buffer == NULL)
206 		return ENOBUFS;
207 
208 	// copy data into buffer
209 
210 	for (uint32 i = 0; i < vecCount; i++) {
211 		if (gNetBufferModule.append(buffer, vecs[i].iov_base,
212 				vecs[i].iov_len) < B_OK) {
213 			gNetBufferModule.free(buffer);
214 			return ENOBUFS;
215 		}
216 	}
217 
218 	memcpy(buffer->source, &socket->address, socket->address.ss_len);
219 	memcpy(buffer->destination, &socket->peer, socket->peer.ss_len);
220 	size_t size = buffer->size;
221 
222 	ssize_t bytesWritten = socket->first_info->send_data(socket->first_protocol,
223 		buffer);
224 	if (bytesWritten < B_OK) {
225 		if (buffer->size != size) {
226 			// this appears to be a partial write
227 			*_length = size - buffer->size;
228 		}
229 		gNetBufferModule.free(buffer);
230 		return bytesWritten;
231 	}
232 
233 	*_length = bytesWritten;
234 	return B_OK;
235 }
236 
237 
238 status_t
239 socket_control(net_socket *socket, int32 op, void *data, size_t length)
240 {
241 	return socket->first_info->control(socket->first_protocol,
242 		LEVEL_DRIVER_IOCTL, op, data, &length);
243 }
244 
245 
246 ssize_t
247 socket_read_avail(net_socket *socket)
248 {
249 	return socket->first_info->read_avail(socket->first_protocol);
250 }
251 
252 
253 ssize_t
254 socket_send_avail(net_socket *socket)
255 {
256 	return socket->first_info->send_avail(socket->first_protocol);
257 }
258 
259 
260 status_t
261 socket_send_data(net_socket *socket, net_buffer *buffer)
262 {
263 	return socket->first_info->send_data(socket->first_protocol,
264 		buffer);
265 }
266 
267 
268 status_t
269 socket_receive_data(net_socket *socket, size_t length, uint32 flags,
270 	net_buffer **_buffer)
271 {
272 	status_t status = socket->first_info->read_data(socket->first_protocol,
273 		length, flags, _buffer);
274 
275 	if (status < B_OK)
276 		return status;
277 
278 	if (*_buffer && length < (*_buffer)->size) {
279 		// discard any data behind the amount requested
280 		gNetBufferModule.trim(*_buffer, length);
281 	}
282 
283 	return status;
284 }
285 
286 
287 status_t
288 socket_get_next_stat(uint32 *_cookie, int family, struct net_stat *stat)
289 {
290 	BenaphoreLocker locker(sSocketLock);
291 
292 	net_socket_private *socket = NULL;
293 	uint32 cookie = *_cookie;
294 	uint32 count = 0;
295 	while ((socket = (net_socket_private *)list_get_next_item(&sSocketList,
296 			socket)) != NULL) {
297 		// TODO: also traverse the pending connections
298 		if (count == cookie)
299 			break;
300 
301 		if (family == -1 || family == socket->family)
302 			count++;
303 	}
304 
305 	if (socket == NULL)
306 		return B_ENTRY_NOT_FOUND;
307 
308 	*_cookie = count + 1;
309 
310 	stat->family = socket->family;
311 	stat->type = socket->type;
312 	stat->protocol = socket->protocol;
313 	stat->owner = socket->owner;
314 	stat->state[0] = '\0';
315 	memcpy(&stat->address, &socket->address, sizeof(struct sockaddr_storage));
316 	memcpy(&stat->peer, &socket->peer, sizeof(struct sockaddr_storage));
317 	stat->receive_queue_size = 0;
318 	stat->send_queue_size = 0;
319 
320 	// fill in protocol specific data (if supported by the protocol)
321 	size_t length = sizeof(net_stat);
322 	socket->first_info->control(socket->first_protocol, socket->protocol,
323 		NET_STAT_SOCKET, stat, &length);
324 
325 	return B_OK;
326 }
327 
328 
329 //	#pragma mark - connections
330 
331 
332 status_t
333 socket_spawn_pending(net_socket *_parent, net_socket **_socket)
334 {
335 	net_socket_private *parent = (net_socket_private *)_parent;
336 
337 	BenaphoreLocker locker(parent->lock);
338 
339 	// We actually accept more pending connections to compensate for those
340 	// that never complete, and also make sure at least a single connection
341 	// can always be accepted
342 	if (parent->child_count > 3 * parent->max_backlog / 2)
343 		return ENOBUFS;
344 
345 	net_socket_private *socket;
346 	status_t status = create_socket(parent->family, parent->type,
347 		parent->protocol, &socket);
348 	if (status < B_OK)
349 		return status;
350 
351 	// inherit parent's properties
352 	socket->send = parent->send;
353 	socket->receive = parent->receive;
354 	socket->options = parent->options & ~SO_ACCEPTCONN;
355 	socket->linger = parent->linger;
356 	socket->owner = parent->owner;
357 	memcpy(&socket->address, &parent->address, parent->address.ss_len);
358 	memcpy(&socket->peer, &parent->peer, parent->peer.ss_len);
359 
360 	// add to the parent's list of pending connections
361 	list_add_item(&parent->pending_children, socket);
362 	socket->parent = parent;
363 	parent->child_count++;
364 
365 	*_socket = socket;
366 	return B_OK;
367 }
368 
369 
370 void
371 socket_delete(net_socket *_socket)
372 {
373 	net_socket_private *socket = (net_socket_private *)_socket;
374 
375 	if (socket->parent != NULL)
376 		panic("socket still has a parent!");
377 
378 	benaphore_lock(&sSocketLock);
379 	list_remove_item(&sSocketList, socket);
380 	benaphore_unlock(&sSocketLock);
381 
382 	// also delete all children of this socket
383 	delete_children(&socket->pending_children);
384 	delete_children(&socket->connected_children);
385 
386 	put_domain_protocols(socket);
387 	benaphore_destroy(&socket->lock);
388 	delete_select_sync_pool(socket->select_pool);
389 	delete socket;
390 }
391 
392 
393 status_t
394 socket_dequeue_connected(net_socket *_parent, net_socket **_socket)
395 {
396 	net_socket_private *parent = (net_socket_private *)_parent;
397 
398 	benaphore_lock(&parent->lock);
399 
400 	net_socket_private *socket = (net_socket_private *)list_remove_head_item(
401 		&parent->connected_children);
402 	if (socket != NULL) {
403 		socket->parent = NULL;
404 		parent->child_count--;
405 		*_socket = socket;
406 	}
407 
408 	benaphore_unlock(&parent->lock);
409 
410 	if (socket == NULL)
411 		return B_ENTRY_NOT_FOUND;
412 
413 	benaphore_lock(&sSocketLock);
414 	list_add_item(&sSocketList, socket);
415 	benaphore_unlock(&sSocketLock);
416 
417 	return B_OK;
418 }
419 
420 
421 ssize_t
422 socket_count_connected(net_socket *_parent)
423 {
424 	net_socket_private *parent = (net_socket_private *)_parent;
425 
426 	BenaphoreLocker _(parent->lock);
427 
428 	ssize_t count = 0;
429 	void *item = NULL;
430 	while ((item = list_get_next_item(&parent->connected_children,
431 			item)) != NULL) {
432 		count++;
433 	}
434 
435 	return count;
436 }
437 
438 
439 status_t
440 socket_set_max_backlog(net_socket *_socket, uint32 backlog)
441 {
442 	net_socket_private *socket = (net_socket_private *)_socket;
443 
444 	// we enforce an upper limit of connections waiting to be accepted
445 	if (backlog > 256)
446 		backlog = 256;
447 
448 	benaphore_lock(&socket->lock);
449 
450 	// first remove the pending connections, then the already connected
451 	// ones as needed
452 	net_socket_private *child;
453 	while (socket->child_count > backlog
454 		&& (child = (net_socket_private *)list_remove_tail_item(
455 				&socket->pending_children)) != NULL) {
456 		child->parent = NULL;
457 		socket->child_count--;
458 	}
459 	while (socket->child_count > backlog
460 		&& (child = (net_socket_private *)list_remove_tail_item(
461 				&socket->connected_children)) != NULL) {
462 		child->parent = NULL;
463 		socket_delete(child);
464 		socket->child_count--;
465 	}
466 
467 	socket->max_backlog = backlog;
468 	benaphore_unlock(&socket->lock);
469 	return B_OK;
470 }
471 
472 
473 /*!
474 	The socket has been connected. It will be moved to the connected queue
475 	of its parent socket.
476 */
477 status_t
478 socket_connected(net_socket *socket)
479 {
480 	net_socket_private *parent = (net_socket_private *)socket->parent;
481 	if (parent == NULL)
482 		return B_BAD_VALUE;
483 
484 	benaphore_lock(&parent->lock);
485 
486 	list_remove_item(&parent->pending_children, socket);
487 	list_add_item(&parent->connected_children, socket);
488 
489 	// notify parent
490 	if (parent->select_pool)
491 		notify_select_event_pool(parent->select_pool, B_SELECT_READ);
492 
493 	benaphore_unlock(&parent->lock);
494 	return B_OK;
495 }
496 
497 
498 //	#pragma mark - notifications
499 
500 
501 status_t
502 socket_request_notification(net_socket *_socket, uint8 event, uint32 ref,
503 	selectsync *sync)
504 {
505 	net_socket_private *socket = (net_socket_private *)_socket;
506 
507 	benaphore_lock(&socket->lock);
508 
509 	status_t status = add_select_sync_pool_entry(&socket->select_pool, sync,
510 		event);
511 
512 	benaphore_unlock(&socket->lock);
513 
514 	if (status < B_OK)
515 		return status;
516 
517 	// check if the event is already present
518 	// TODO: add support for poll() types
519 
520 	switch (event) {
521 		case B_SELECT_READ:
522 		{
523 			ssize_t available = socket_read_avail(socket);
524 			if ((ssize_t)socket->receive.low_water_mark <= available
525 				|| available < B_OK)
526 				notify_select_event(sync, event);
527 			break;
528 		}
529 		case B_SELECT_WRITE:
530 		{
531 			ssize_t available = socket_send_avail(socket);
532 			if ((ssize_t)socket->send.low_water_mark <= available
533 				|| available < B_OK)
534 				notify_select_event(sync, event);
535 			break;
536 		}
537 		case B_SELECT_ERROR:
538 			// TODO: B_SELECT_ERROR condition!
539 			break;
540 	}
541 
542 	return B_OK;
543 }
544 
545 
546 status_t
547 socket_cancel_notification(net_socket *_socket, uint8 event, selectsync *sync)
548 {
549 	net_socket_private *socket = (net_socket_private *)_socket;
550 
551 	benaphore_lock(&socket->lock);
552 
553 	status_t status = remove_select_sync_pool_entry(&socket->select_pool,
554 		sync, event);
555 
556 	benaphore_unlock(&socket->lock);
557 	return status;
558 }
559 
560 
561 status_t
562 socket_notify(net_socket *_socket, uint8 event, int32 value)
563 {
564 	net_socket_private *socket = (net_socket_private *)_socket;
565 	bool notify = true;
566 
567 	switch (event) {
568 		case B_SELECT_READ:
569 			if ((ssize_t)socket->receive.low_water_mark > value
570 				&& value >= B_OK)
571 				notify = false;
572 			break;
573 
574 		case B_SELECT_WRITE:
575 			if ((ssize_t)socket->send.low_water_mark > value
576 				&& value >= B_OK)
577 				notify = false;
578 			break;
579 
580 		case B_SELECT_ERROR:
581 			socket->error = value;
582 			break;
583 	}
584 
585 	benaphore_lock(&socket->lock);
586 
587 	if (notify && socket->select_pool)
588 		notify_select_event_pool(socket->select_pool, event);
589 
590 	benaphore_unlock(&socket->lock);
591 	return B_OK;
592 }
593 
594 
595 //	#pragma mark - standard socket API
596 
597 
598 int
599 socket_accept(net_socket *socket, struct sockaddr *address,
600 	socklen_t *_addressLength, net_socket **_acceptedSocket)
601 {
602 	if ((socket->options & SO_ACCEPTCONN) == 0)
603 		return B_BAD_VALUE;
604 
605 	net_socket *accepted;
606 	status_t status = socket->first_info->accept(socket->first_protocol,
607 		&accepted);
608 	if (status < B_OK)
609 		return status;
610 
611 	if (address && *_addressLength > 0) {
612 		memcpy(address, &accepted->peer, min_c(*_addressLength,
613 			min_c(accepted->peer.ss_len, sizeof(sockaddr_storage))));
614 		*_addressLength = accepted->peer.ss_len;
615 	}
616 
617 	*_acceptedSocket = accepted;
618 	return B_OK;
619 }
620 
621 
622 int
623 socket_bind(net_socket *socket, const struct sockaddr *address,
624 	socklen_t addressLength)
625 {
626 	sockaddr empty;
627 	if (address == NULL) {
628 		// special - try to bind to an empty address, like INADDR_ANY
629 		memset(&empty, 0, sizeof(sockaddr));
630 		empty.sa_len = sizeof(sockaddr);
631 		empty.sa_family = socket->family;
632 
633 		address = &empty;
634 		addressLength = sizeof(sockaddr);
635 	}
636 
637 	if (socket->address.ss_len != 0) {
638 		status_t status = socket->first_info->unbind(socket->first_protocol,
639 			(sockaddr *)&socket->address);
640 		if (status < B_OK)
641 			return status;
642 	}
643 
644 	memcpy(&socket->address, address, sizeof(sockaddr));
645 
646 	status_t status = socket->first_info->bind(socket->first_protocol,
647 		(sockaddr *)address);
648 	if (status < B_OK) {
649 		// clear address again, as binding failed
650 		socket->address.ss_len = 0;
651 	}
652 
653 	return status;
654 }
655 
656 
657 int
658 socket_connect(net_socket *socket, const struct sockaddr *address,
659 	socklen_t addressLength)
660 {
661 	if (address == NULL || addressLength == 0)
662 		return ENETUNREACH;
663 
664 	if (socket->address.ss_len == 0) {
665 		// try to bind first
666 		status_t status = socket_bind(socket, NULL, 0);
667 		if (status < B_OK)
668 			return status;
669 	}
670 
671 	return socket->first_info->connect(socket->first_protocol, address);
672 }
673 
674 
675 int
676 socket_getpeername(net_socket *socket, struct sockaddr *address,
677 	socklen_t *_addressLength)
678 {
679 	if (socket->peer.ss_len == 0)
680 		return ENOTCONN;
681 
682 	memcpy(address, &socket->peer, min_c(*_addressLength, socket->peer.ss_len));
683 	*_addressLength = socket->peer.ss_len;
684 	return B_OK;
685 }
686 
687 
688 int
689 socket_getsockname(net_socket *socket, struct sockaddr *address,
690 	socklen_t *_addressLength)
691 {
692 	if (socket->address.ss_len == 0)
693 		return ENOTCONN;
694 
695 	memcpy(address, &socket->address, min_c(*_addressLength,
696 		socket->address.ss_len));
697 	*_addressLength = socket->address.ss_len;
698 	return B_OK;
699 }
700 
701 
702 status_t
703 socket_get_option(net_socket *socket, int level, int option, void *value,
704 	int *_length)
705 {
706 	if (level != SOL_SOCKET)
707 		return ENOPROTOOPT;
708 
709 	switch (option) {
710 		case SO_SNDBUF:
711 		{
712 			uint32 *size = (uint32 *)value;
713 			*size = socket->send.buffer_size;
714 			*_length = sizeof(uint32);
715 			return B_OK;
716 		}
717 
718 		case SO_RCVBUF:
719 		{
720 			uint32 *size = (uint32 *)value;
721 			*size = socket->receive.buffer_size;
722 			*_length = sizeof(uint32);
723 			return B_OK;
724 		}
725 
726 		case SO_SNDLOWAT:
727 		{
728 			uint32 *size = (uint32 *)value;
729 			*size = socket->send.low_water_mark;
730 			*_length = sizeof(uint32);
731 			return B_OK;
732 		}
733 
734 		case SO_RCVLOWAT:
735 		{
736 			uint32 *size = (uint32 *)value;
737 			*size = socket->receive.low_water_mark;
738 			*_length = sizeof(uint32);
739 			return B_OK;
740 		}
741 
742 		case SO_RCVTIMEO:
743 		case SO_SNDTIMEO:
744 		{
745 			if (*_length < (int)sizeof(struct timeval))
746 				return B_BAD_VALUE;
747 
748 			bigtime_t timeout;
749 			if (option == SO_SNDTIMEO)
750 				timeout = socket->send.timeout;
751 			else
752 				timeout = socket->receive.timeout;
753 			if (timeout == B_INFINITE_TIMEOUT)
754 				timeout = 0;
755 
756 			struct timeval *timeval = (struct timeval *)value;
757 			timeval->tv_sec = timeout / 1000000LL;
758 			timeval->tv_usec = timeout % 1000000LL;
759 
760 			*_length = sizeof(struct timeval);
761 			return B_OK;
762 		}
763 
764 		case SO_NONBLOCK:
765 		{
766 			int32 *_set = (int32 *)value;
767 			*_set = socket->receive.timeout == 0 && socket->send.timeout == 0;
768 			*_length = sizeof(int32);
769 			return B_OK;
770 		}
771 
772 		case SO_ACCEPTCONN:
773 		case SO_BROADCAST:
774 		case SO_DEBUG:
775 		case SO_DONTROUTE:
776 		case SO_KEEPALIVE:
777 		case SO_OOBINLINE:
778 		case SO_REUSEADDR:
779 		case SO_REUSEPORT:
780 		case SO_USELOOPBACK:
781 		{
782 			int32 *_set = (int32 *)value;
783 			*_set = (socket->options & option) != 0;
784 			*_length = sizeof(int32);
785 			return B_OK;
786 		}
787 
788 		case SO_ERROR:
789 		{
790 			int32 *_set = (int32 *)value;
791 			*_set = socket->error;
792 			*_length = sizeof(int32);
793 
794 			socket->error = B_OK;
795 				// clear error upon retrieval
796 			return B_OK;
797 		}
798 
799 		default:
800 			break;
801 	}
802 
803 	dprintf("socket_getsockopt: unknown option %d\n", option);
804 	return ENOPROTOOPT;
805 }
806 
807 
808 int
809 socket_getsockopt(net_socket *socket, int level, int option, void *value,
810 	int *_length)
811 {
812 	return socket->first_protocol->module->getsockopt(socket->first_protocol,
813 		level, option, value, _length);
814 }
815 
816 
817 int
818 socket_listen(net_socket *socket, int backlog)
819 {
820 	status_t status = socket->first_info->listen(socket->first_protocol,
821 		backlog);
822 	if (status == B_OK)
823 		socket->options |= SO_ACCEPTCONN;
824 
825 	return status;
826 }
827 
828 
829 ssize_t
830 socket_receive(net_socket *socket, msghdr *header, void *data, size_t length,
831 	int flags)
832 {
833 	size_t totalLength = length;
834 	net_buffer *buffer;
835 	iovec tmp;
836 	int i;
837 
838 	// the convention to this function is that have header been
839 	// present, { data, length } would have been iovec[0] and is
840 	// always considered like that
841 
842 	if (header) {
843 		// calculate the length considering all of the extra buffers
844 		for (i = 1; i < header->msg_iovlen; i++) {
845 			if (user_memcpy(&tmp, header->msg_iov + i, sizeof(iovec)) < B_OK)
846 				return B_BAD_ADDRESS;
847 			if (tmp.iov_len > 0 && tmp.iov_base == NULL)
848 				return B_BAD_ADDRESS;
849 			totalLength += tmp.iov_len;
850 		}
851 	}
852 
853 	status_t status = socket->first_info->read_data(
854 		socket->first_protocol, totalLength, flags, &buffer);
855 	if (status < B_OK)
856 		return status;
857 
858 	// TODO: - returning a NULL buffer when received 0 bytes
859 	//         may not make much sense as we still need the address
860 	//       - gNetBufferModule.read() uses memcpy() instead of user_memcpy
861 
862 	size_t nameLen = 0;
863 
864 	if (header) {
865 		// TODO: - consider the control buffer options
866 		nameLen = header->msg_namelen;
867 		header->msg_namelen = 0;
868 		header->msg_flags = 0;
869 	}
870 
871 	if (buffer == NULL)
872 		return 0;
873 
874 	size_t bytesReceived = buffer->size, bytesCopied = 0;
875 
876 	length = min_c(bytesReceived, length);
877 	if (gNetBufferModule.read(buffer, 0, data, length) < B_OK) {
878 		gNetBufferModule.free(buffer);
879 		return ENOBUFS;
880 	}
881 
882 	// if first copy was a success, proceed to following
883 	// copies as required
884 	bytesCopied += length;
885 
886 	if (header) {
887 		// we only start considering at iovec[1]
888 		// as { data, length } is iovec[0]
889 		for (i = 1; i < header->msg_iovlen && bytesCopied < bytesReceived; i++) {
890 			if (user_memcpy(&tmp, header->msg_iov + i, sizeof(iovec)) < B_OK)
891 				break;
892 
893 			size_t toRead = min_c(bytesReceived - bytesCopied, tmp.iov_len);
894 			if (gNetBufferModule.read(buffer, bytesCopied, tmp.iov_base,
895 										toRead) < B_OK)
896 				break;
897 
898 			bytesCopied += toRead;
899 		}
900 
901 		if (header->msg_name != NULL) {
902 			header->msg_namelen = min_c(nameLen, buffer->source->sa_len);
903 			memcpy(header->msg_name, buffer->source, header->msg_namelen);
904 		}
905 	}
906 
907 	gNetBufferModule.free(buffer);
908 
909 	if (bytesCopied < bytesReceived) {
910 		if (header)
911 			header->msg_flags = MSG_TRUNC;
912 
913 		if (flags & MSG_TRUNC)
914 			return bytesReceived;
915 	}
916 
917 	return bytesCopied;
918 }
919 
920 
921 ssize_t
922 socket_send(net_socket *socket, msghdr *header, const void *data, size_t length,
923 	int flags)
924 {
925 	const sockaddr *address = NULL;
926 	socklen_t addressLength = 0;
927 	size_t bytesLeft = length;
928 
929 	if (length > SSIZE_MAX)
930 		return B_BAD_VALUE;
931 
932 	// the convention to this function is that have header been
933 	// present, { data, length } would have been iovec[0] and is
934 	// always considered like that
935 
936 	if (header != NULL) {
937 		address = (const sockaddr *)header->msg_name;
938 		addressLength = header->msg_namelen;
939 
940 		if (header->msg_iovlen <= 1)
941 			header = NULL;
942 		else {
943 			bytesLeft += compute_user_iovec_length(header->msg_iov + 1,
944 				header->msg_iovlen - 1);
945 		}
946 	}
947 
948 	if (addressLength == 0)
949 		address = NULL;
950 	else if (addressLength != 0 && address == NULL)
951 		return B_BAD_VALUE;
952 
953 	if (socket->peer.ss_len != 0) {
954 		if (address != NULL)
955 			return EISCONN;
956 
957 		// socket is connected, we use that address
958 		address = (struct sockaddr *)&socket->peer;
959 		addressLength = socket->peer.ss_len;
960 	}
961 
962 	if (address == NULL || addressLength == 0) {
963 		// don't know where to send to:
964 		return EDESTADDRREQ;
965 	}
966 
967 	if ((socket->first_info->flags & NET_PROTOCOL_ATOMIC_MESSAGES) != 0
968 		&& bytesLeft > socket->send.buffer_size)
969 		return EMSGSIZE;
970 
971 	if (socket->address.ss_len == 0) {
972 		// try to bind first
973 		status_t status = socket_bind(socket, NULL, 0);
974 		if (status < B_OK)
975 			return status;
976 	}
977 
978 	ssize_t bytesSent = 0;
979 	size_t vecOffset = 0;
980 	uint32 vecIndex = 0;
981 
982 	while (bytesLeft > 0) {
983 		// TODO: useful, maybe even computed header space!
984 		net_buffer *buffer = gNetBufferModule.create(256);
985 		if (buffer == NULL)
986 			return ENOBUFS;
987 
988 		while (buffer->size < socket->send.buffer_size
989 			&& buffer->size < bytesLeft) {
990 			if (vecIndex > 0 && vecOffset == 0) {
991 				// retrieve next iovec buffer from header
992 				iovec vec;
993 				if (user_memcpy(&vec, header->msg_iov + vecIndex, sizeof(iovec))
994 						< B_OK) {
995 					gNetBufferModule.free(buffer);
996 					return B_BAD_ADDRESS;
997 				}
998 
999 				data = vec.iov_base;
1000 				length = vec.iov_len;
1001 			}
1002 
1003 			size_t bytes = length;
1004 			if (buffer->size + bytes > socket->send.buffer_size)
1005 				bytes = socket->send.buffer_size - buffer->size;
1006 
1007 			if (gNetBufferModule.append(buffer, data, bytes) < B_OK) {
1008 				gNetBufferModule.free(buffer);
1009 				return ENOBUFS;
1010 			}
1011 
1012 			if (bytes != length) {
1013 				// partial send
1014 				vecOffset = bytes;
1015 				length -= vecOffset;
1016 				data = (uint8 *)data + vecOffset;
1017 			} else if (header != NULL) {
1018 				// proceed with next buffer, if any
1019 				vecOffset = 0;
1020 				vecIndex++;
1021 
1022 				if (vecIndex >= (uint32)header->msg_iovlen)
1023 					break;
1024 			}
1025 		}
1026 
1027 		size_t bufferSize = buffer->size;
1028 		buffer->flags = flags;
1029 		memcpy(buffer->source, &socket->address, socket->address.ss_len);
1030 		memcpy(buffer->destination, address, addressLength);
1031 
1032 		status_t status = socket->first_info->send_data(socket->first_protocol,
1033 			buffer);
1034 		if (status < B_OK) {
1035 			size_t sizeAfterSend = buffer->size;
1036 			gNetBufferModule.free(buffer);
1037 
1038 			if (sizeAfterSend != bufferSize
1039 				&& (status == B_INTERRUPTED || status == B_WOULD_BLOCK)) {
1040 				// this appears to be a partial write
1041 				return bytesSent + (bufferSize - sizeAfterSend);
1042 			}
1043 			return status;
1044 		}
1045 
1046 		bytesLeft -= bufferSize;
1047 		bytesSent += bufferSize;
1048 	}
1049 
1050 	return bytesSent;
1051 }
1052 
1053 
1054 status_t
1055 socket_set_option(net_socket *socket, int level, int option, const void *value,
1056 	int length)
1057 {
1058 	if (level != SOL_SOCKET)
1059 		return ENOPROTOOPT;
1060 
1061 	switch (option) {
1062 		// TODO: implement other options!
1063 		case SO_LINGER:
1064 		{
1065 			if (length < (int)sizeof(struct linger))
1066 				return B_BAD_VALUE;
1067 
1068 			struct linger *linger = (struct linger *)value;
1069 			if (linger->l_onoff) {
1070 				socket->options |= SO_LINGER;
1071 				socket->linger = linger->l_linger;
1072 			} else {
1073 				socket->options &= ~SO_LINGER;
1074 				socket->linger = 0;
1075 			}
1076 			return B_OK;
1077 		}
1078 
1079 		case SO_SNDBUF:
1080 			if (length != sizeof(uint32))
1081 				return B_BAD_VALUE;
1082 
1083 			socket->send.buffer_size = *(const uint32 *)value;
1084 			return B_OK;
1085 
1086 		case SO_RCVBUF:
1087 			if (length != sizeof(uint32))
1088 				return B_BAD_VALUE;
1089 
1090 			socket->receive.buffer_size = *(const uint32 *)value;
1091 			return B_OK;
1092 
1093 		case SO_SNDLOWAT:
1094 			if (length != sizeof(uint32))
1095 				return B_BAD_VALUE;
1096 
1097 			socket->send.low_water_mark = *(const uint32 *)value;
1098 			return B_OK;
1099 
1100 		case SO_RCVLOWAT:
1101 			if (length != sizeof(uint32))
1102 				return B_BAD_VALUE;
1103 
1104 			socket->receive.low_water_mark = *(const uint32 *)value;
1105 			return B_OK;
1106 
1107 		case SO_RCVTIMEO:
1108 		case SO_SNDTIMEO:
1109 		{
1110 			if (length != sizeof(struct timeval))
1111 				return B_BAD_VALUE;
1112 
1113 			const struct timeval *timeval = (const struct timeval *)value;
1114 			bigtime_t timeout = timeval->tv_sec * 1000000LL + timeval->tv_usec;
1115 			if (timeout == 0)
1116 				timeout = B_INFINITE_TIMEOUT;
1117 
1118 			if (option == SO_SNDTIMEO)
1119 				socket->send.timeout = timeout;
1120 			else
1121 				socket->receive.timeout = timeout;
1122 			return B_OK;
1123 		}
1124 
1125 		case SO_NONBLOCK:
1126 			if (length != sizeof(int32))
1127 				return B_BAD_VALUE;
1128 
1129 			if (*(const int32 *)value) {
1130 				socket->send.timeout = 0;
1131 				socket->receive.timeout = 0;
1132 			} else {
1133 				socket->send.timeout = B_INFINITE_TIMEOUT;
1134 				socket->receive.timeout = B_INFINITE_TIMEOUT;
1135 			}
1136 			return B_OK;
1137 
1138 		case SO_BROADCAST:
1139 		case SO_DEBUG:
1140 		case SO_DONTROUTE:
1141 		case SO_KEEPALIVE:
1142 		case SO_OOBINLINE:
1143 		case SO_REUSEADDR:
1144 		case SO_REUSEPORT:
1145 		case SO_USELOOPBACK:
1146 			if (length != sizeof(int32))
1147 				return B_BAD_VALUE;
1148 
1149 			if (*(const int32 *)value)
1150 				socket->options |= option;
1151 			else
1152 				socket->options &= ~option;
1153 			return B_OK;
1154 
1155 		default:
1156 			break;
1157 	}
1158 
1159 	dprintf("socket_setsockopt: unknown option %d\n", option);
1160 	return ENOPROTOOPT;
1161 }
1162 
1163 
1164 int
1165 socket_setsockopt(net_socket *socket, int level, int option, const void *value,
1166 	int length)
1167 {
1168 	return socket->first_protocol->module->setsockopt(socket->first_protocol,
1169 		level, option, value, length);
1170 }
1171 
1172 
1173 int
1174 socket_shutdown(net_socket *socket, int direction)
1175 {
1176 	return socket->first_info->shutdown(socket->first_protocol, direction);
1177 }
1178 
1179 
1180 //	#pragma mark -
1181 
1182 
1183 static status_t
1184 socket_std_ops(int32 op, ...)
1185 {
1186 	switch (op) {
1187 		case B_MODULE_INIT:
1188 		{
1189 			// TODO: this is currently done in the net_stack driver
1190 			// initialize the main stack if not done so already
1191 			//module_info *module;
1192 			//return get_module(NET_STARTER_MODULE_NAME, &module);
1193 			list_init_etc(&sSocketList, offsetof(net_socket_private, link));
1194 			return benaphore_init(&sSocketLock, "socket list");
1195 		}
1196 		case B_MODULE_UNINIT:
1197 			//return put_module(NET_STARTER_MODULE_NAME);
1198 			benaphore_destroy(&sSocketLock);
1199 			return B_OK;
1200 
1201 		default:
1202 			return B_ERROR;
1203 	}
1204 }
1205 
1206 
1207 net_socket_module_info gNetSocketModule = {
1208 	{
1209 		NET_SOCKET_MODULE_NAME,
1210 		0,
1211 		socket_std_ops
1212 	},
1213 	socket_open,
1214 	socket_close,
1215 	socket_free,
1216 
1217 	socket_readv,
1218 	socket_writev,
1219 	socket_control,
1220 
1221 	socket_read_avail,
1222 	socket_send_avail,
1223 
1224 	socket_send_data,
1225 	socket_receive_data,
1226 
1227 	socket_get_option,
1228 	socket_set_option,
1229 
1230 	socket_get_next_stat,
1231 
1232 	// connections
1233 	socket_spawn_pending,
1234 	socket_delete,
1235 	socket_dequeue_connected,
1236 	socket_count_connected,
1237 	socket_set_max_backlog,
1238 	socket_connected,
1239 
1240 	// notifications
1241 	socket_request_notification,
1242 	socket_cancel_notification,
1243 	socket_notify,
1244 
1245 	// standard socket API
1246 	socket_accept,
1247 	socket_bind,
1248 	socket_connect,
1249 	socket_getpeername,
1250 	socket_getsockname,
1251 	socket_getsockopt,
1252 	socket_listen,
1253 	socket_receive,
1254 	socket_send,
1255 	socket_setsockopt,
1256 	socket_shutdown,
1257 };
1258 
1259