xref: /haiku/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp (revision 6e927a5fc080cb934e7584454f472cacf4c3e361)
1 /*
2  * Copyright 2006-2007, Haiku, Inc. All Rights Reserved.
3  * Distributed under the terms of the MIT License.
4  *
5  * Authors:
6  *		Andrew Galante, haiku.galante@gmail.com
7  *		Axel Dörfler, axeld@pinc-software.de
8  *		Hugo Santos, hugosantos@gmail.com
9  */
10 
11 
12 #include "TCPEndpoint.h"
13 #include "EndpointManager.h"
14 
15 #include <net_buffer.h>
16 #include <net_datalink.h>
17 #include <net_stat.h>
18 #include <NetBufferUtilities.h>
19 #include <NetUtilities.h>
20 
21 #include <lock.h>
22 #include <util/AutoLock.h>
23 #include <util/khash.h>
24 #include <util/list.h>
25 
26 #include <KernelExport.h>
27 #include <Select.h>
28 
29 #include <netinet/in.h>
30 #include <netinet/ip.h>
31 #include <netinet/tcp.h>
32 #include <new>
33 #include <stdlib.h>
34 #include <string.h>
35 
36 
37 // References:
38 //  - RFC 793 - Transmission Control Protocol
39 //  - RFC 813 - Window and Acknowledgement Strategy in TCP
40 //
41 // Things this implementation currently doesn't implement:
42 //
43 // TCP Slow Start, Congestion Avoidance, Fast Retransmit, and Fast Recovery, RFC 2001, RFC 2581, RFC 3042
44 // NewReno Modification to TCP's Fast Recovery, RFC 2582
45 // Explicit Congestion Notification (ECN), RFC 3168
46 // SYN-Cache
47 // TCP Extensions for High Performance, RFC 1323
48 // SACK, Selective Acknowledgment - RFC 2018, RFC 2883, RFC 3517
49 // Forward RTO-Recovery, RFC 4138
50 
// Formats \a address as a C string by way of AddressString. The expansion
// calls Domain(), so it can only be used where Domain() is in scope.
#define PrintAddress(address) \
	AddressString(Domain(), address, true).Data()

// Uncomment to compile in the TRACE() and/or PROBE() output defined below.
//#define TRACE_TCP
//#define PROBE_TCP

#ifdef TRACE_TCP
// Every message is prefixed with the current system_time(), the value of
// "this", and the name of fState - the macro must therefore be expanded
// inside a member function.
// NOTE: the blank in front of ', ##args' is required for cpp 2.95.
#	define TRACE(format, args...) \
	dprintf("TCP [%llu] %p (%12s) " format "\n", system_time(), this, \
		name_for_state(fState) , ##args)
#else
#	define TRACE(args...)			do { } while (0)
#endif

#ifdef PROBE_TCP
// Prints one line with the send-side state of the endpoint for \a buffer and
// the given \a window.
#	define PROBE(buffer, window) \
	dprintf("TCP PROBE %llu %s %s %ld snxt %lu suna %lu cw %lu sst %lu win %lu swin %lu smax-suna %lu savail %lu sqused %lu rto %llu\n", \
		system_time(), \
		PrintAddress(buffer->source), PrintAddress(buffer->destination), \
		buffer->size, \
		(uint32)fSendNext, (uint32)fSendUnacknowledged, \
		fCongestionWindow, fSlowStartThreshold, \
		window, fSendWindow, \
		(uint32)(fSendMax - fSendUnacknowledged), \
		fSendQueue.Available(fSendNext), fSendQueue.Used(), \
		fRetransmitTimeout)
#else
#	define PROBE(buffer, window)	do { } while (0)
#endif
76 
// Initial estimate for the packet round trip time (RTT); the constructor
// also uses it as the initial retransmit timeout.
#define TCP_INITIAL_RTT		2000000

// Bit values for the fFlags field
enum {
	FLAG_OPTION_WINDOW_SCALE	= 1 << 0,
	FLAG_OPTION_TIMESTAMP		= 1 << 1,
	// TODO: Should FLAG_NO_RECEIVE apply as well to received connections?
	//       That is, what is expected from accept() after a shutdown()
	//       is performed on a listen()ing socket.
	FLAG_NO_RECEIVE				= 1 << 2,
};


// Divisor that turns system_time() into the tick unit of TCP timestamps
static const int kTimestampFactor = 1024;
92 
93 
94 static inline bigtime_t
95 absolute_timeout(bigtime_t timeout)
96 {
97 	if (timeout == 0 || timeout == B_INFINITE_TIMEOUT)
98 		return timeout;
99 
100 	return timeout + system_time();
101 }
102 
103 
104 static inline status_t
105 posix_error(status_t error)
106 {
107 	if (error == B_TIMED_OUT)
108 		return B_WOULD_BLOCK;
109 
110 	return error;
111 }
112 
113 
114 static inline bool
115 in_window(const tcp_sequence &sequence, const tcp_sequence &rcvNext,
116 	uint32 rcvWindow)
117 {
118 	return sequence >= rcvNext && sequence < (rcvNext + rcvWindow);
119 }
120 
121 
122 static inline bool
123 segment_in_sequence(const tcp_segment_header &segment, int size,
124 	const tcp_sequence &rcvNext, uint32 rcvWindow)
125 {
126 	tcp_sequence sequence(segment.sequence);
127 	if (size == 0) {
128 		if (rcvWindow == 0)
129 			return sequence == rcvNext;
130 		return in_window(sequence, rcvNext, rcvWindow);
131 	} else {
132 		if (rcvWindow == 0)
133 			return false;
134 		return in_window(sequence, rcvNext, rcvWindow)
135 			|| in_window(sequence + size - 1, rcvNext, rcvWindow);
136 	}
137 }
138 
139 
140 static inline bool
141 is_writable(tcp_state state)
142 {
143 	return state == SYNCHRONIZE_SENT || state == SYNCHRONIZE_RECEIVED
144 		|| state == ESTABLISHED || state == FINISH_RECEIVED;
145 }
146 
147 
148 static inline uint32 tcp_now()
149 {
150 	return system_time() / kTimestampFactor;
151 }
152 
153 
154 static inline uint32 tcp_diff_timestamp(uint32 base)
155 {
156 	uint32 now = tcp_now();
157 
158 	if (now > base)
159 		return now - base;
160 
161 	return now + UINT_MAX - base;
162 }
163 
164 
165 static inline bool
166 state_needs_finish(int32 state)
167 {
168 	return state == WAIT_FOR_FINISH_ACKNOWLEDGE
169 		|| state == FINISH_SENT || state == CLOSING;
170 }
171 
172 
173 WaitList::WaitList(const char *name)
174 {
175 	fCondition = 0;
176 	fSem = create_sem(0, name);
177 }
178 
179 
180 WaitList::~WaitList()
181 {
182 	delete_sem(fSem);
183 }
184 
185 
186 status_t
187 WaitList::InitCheck() const
188 {
189 	return fSem;
190 }
191 
192 
193 status_t
194 WaitList::Wait(MutexLocker &locker, bigtime_t timeout, bool wakeNext)
195 {
196 	locker.Unlock();
197 
198 	status_t status = B_OK;
199 
200 	while (status == B_OK && !atomic_test_and_set(&fCondition, 0, 1)) {
201 		status = acquire_sem_etc(fSem, 1, B_ABSOLUTE_TIMEOUT | B_CAN_INTERRUPT,
202 			timeout);
203 	}
204 
205 	locker.Lock();
206 	if (status == B_OK && wakeNext)
207 		Signal();
208 
209 	return status;
210 }
211 
212 
213 void
214 WaitList::Signal()
215 {
216 	atomic_or(&fCondition, 1);
217 #ifdef __HAIKU__
218 	release_sem_etc(fSem, 1, B_DO_NOT_RESCHEDULE | B_RELEASE_IF_WAITING_ONLY);
219 #else
220 	int32 count;
221 	if (get_sem_count(fSem, &count) == B_OK && count < 0)
222 		release_sem_etc(fSem, 1, B_DO_NOT_RESCHEDULE);
223 #endif
224 }
225 
226 
227 TCPEndpoint::TCPEndpoint(net_socket *socket)
228 	:
229 	ProtocolSocket(socket),
230 	fManager(NULL),
231 	fReceiveList("tcp receive"),
232 	fSendList("tcp send"),
233 	fOptions(0),
234 	fSendWindowShift(0),
235 	fReceiveWindowShift(0),
236 	fSendUnacknowledged(0),
237 	fSendNext(0),
238 	fSendMax(0),
239 	fSendWindow(0),
240 	fSendMaxWindow(0),
241 	fSendMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE),
242 	fSendQueue(socket->send.buffer_size),
243 	fInitialSendSequence(0),
244 	fDuplicateAcknowledgeCount(0),
245 	fRoute(NULL),
246 	fReceiveNext(0),
247 	fReceiveMaxAdvertised(0),
248 	fReceiveWindow(socket->receive.buffer_size),
249 	fReceiveMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE),
250 	fReceiveQueue(socket->receive.buffer_size),
251 	fRoundTripTime(TCP_INITIAL_RTT / kTimestampFactor),
252 	fRoundTripDeviation(TCP_INITIAL_RTT / kTimestampFactor),
253 	fRetransmitTimeout(TCP_INITIAL_RTT),
254 	fReceivedTimestamp(0),
255 	fCongestionWindow(0),
256 	fSlowStartThreshold(0),
257 	fState(CLOSED),
258 	fFlags(FLAG_OPTION_WINDOW_SCALE | FLAG_OPTION_TIMESTAMP),
259 	fError(B_OK)
260 {
261 	//gStackModule->init_timer(&fTimer, _TimeWait, this);
262 
263 	// TODO: to be replaced with a real locking strategy!
264 	mutex_init(&fLock, "tcp lock");
265 
266 	gStackModule->init_timer(&fPersistTimer, TCPEndpoint::_PersistTimer, this);
267 	gStackModule->init_timer(&fRetransmitTimer, TCPEndpoint::_RetransmitTimer, this);
268 	gStackModule->init_timer(&fDelayedAcknowledgeTimer,
269 		TCPEndpoint::_DelayedAcknowledgeTimer, this);
270 	gStackModule->init_timer(&fTimeWaitTimer, TCPEndpoint::_TimeWaitTimer, this);
271 }
272 
273 
274 TCPEndpoint::~TCPEndpoint()
275 {
276 	mutex_lock(&fLock);
277 
278 	gStackModule->cancel_timer(&fRetransmitTimer);
279 	gStackModule->cancel_timer(&fPersistTimer);
280 	gStackModule->cancel_timer(&fDelayedAcknowledgeTimer);
281 	gStackModule->cancel_timer(&fTimeWaitTimer);
282 
283 	if (fManager) {
284 		fManager->Unbind(this);
285 		return_endpoint_manager(fManager);
286 	}
287 
288 	mutex_destroy(&fLock);
289 }
290 
291 
292 status_t
293 TCPEndpoint::InitCheck() const
294 {
295 	if (fLock.sem < B_OK)
296 		return fLock.sem;
297 
298 	if (fReceiveList.InitCheck() < B_OK)
299 		return fReceiveList.InitCheck();
300 
301 	if (fSendList.InitCheck() < B_OK)
302 		return fSendList.InitCheck();
303 
304 	return B_OK;
305 }
306 
307 
308 //	#pragma mark - protocol API
309 
310 
311 status_t
312 TCPEndpoint::Open()
313 {
314 	TRACE("Open()");
315 
316 	status_t status = ProtocolSocket::Open();
317 	if (status < B_OK)
318 		return status;
319 
320 	fManager = create_endpoint_manager(Domain());
321 	if (fManager == NULL)
322 		return EAFNOSUPPORT;
323 
324 	return B_OK;
325 }
326 
327 
328 status_t
329 TCPEndpoint::Close()
330 {
331 	TRACE("Close()");
332 
333 	MutexLocker lock(fLock);
334 
335 	if (fState == LISTEN)
336 		delete_sem(fAcceptSemaphore);
337 
338 	if (fState == SYNCHRONIZE_SENT || fState == LISTEN) {
339 		fState = CLOSED;
340 		return B_OK;
341 	}
342 
343 	status_t status = _ShutdownEgress(true);
344 	if (status != B_OK)
345 		return status;
346 
347 	if (socket->options & SO_LINGER) {
348 		TRACE("Close(): Lingering for %i secs", socket->linger);
349 
350 		bigtime_t maximum = absolute_timeout(socket->linger * 1000000LL);
351 
352 		while (fSendQueue.Used() > 0) {
353 			status = fSendList.Wait(lock, maximum);
354 			if (status == B_TIMED_OUT || status == B_WOULD_BLOCK)
355 				break;
356 			else if (status < B_OK)
357 				return status;
358 		}
359 
360 		TRACE("Close(): after waiting, the SendQ was left with %lu bytes.",
361 			fSendQueue.Used());
362 	}
363 
364 	// TODO: do i need to wait until fState returns to CLOSED?
365 	return B_OK;
366 }
367 
368 
369 status_t
370 TCPEndpoint::Free()
371 {
372 	TRACE("Free()");
373 
374 	MutexLocker _(fLock);
375 
376 	if (fState <= SYNCHRONIZE_SENT || fState == TIME_WAIT)
377 		return B_OK;
378 
379 	// we are only interested in the timer, not in changing state
380 	_EnterTimeWait();
381 	return B_BUSY;
382 		// we'll be freed later when the 2MSL timer expires
383 }
384 
385 
386 /*!
387 	Creates and sends a synchronize packet to /a address, and then waits
388 	until the connection has been established or refused.
389 */
390 status_t
391 TCPEndpoint::Connect(const sockaddr *address)
392 {
393 	TRACE("Connect() on address %s", PrintAddress(address));
394 
395 	MutexLocker locker(fLock);
396 
397 	// Can only call connect() from CLOSED or LISTEN states
398 	// otherwise endpoint is considered already connected
399 	if (fState == LISTEN) {
400 		// this socket is about to connect; remove pending connections in the backlog
401 		gSocketModule->set_max_backlog(socket, 0);
402 	} else if (fState != CLOSED)
403 		return EISCONN;
404 
405 	status_t status = _PrepareSendPath(address);
406 	if (status < B_OK)
407 		return status;
408 
409 	TRACE("  Connect(): starting 3-way handshake...");
410 
411 	fState = SYNCHRONIZE_SENT;
412 
413 	// send SYN
414 	status = _SendQueued();
415 	if (status != B_OK) {
416 		fState = CLOSED;
417 		return status;
418 	}
419 
420 	// If we are running over Loopback, after _SendQueued() returns we
421 	// may be in ESTABLISHED already.
422 	if (fState == ESTABLISHED) {
423 		TRACE("  Connect() completed after _SendQueued()");
424 		return B_OK;
425 	}
426 
427 	// wait until 3-way handshake is complete (if needed)
428 	bigtime_t timeout = min_c(socket->send.timeout, TCP_CONNECTION_TIMEOUT);
429 	if (timeout == 0) {
430 		// we're a non-blocking socket
431 		TRACE("  Connect() delayed, return EINPROGRESS");
432 		return EINPROGRESS;
433 	}
434 
435 	status = _WaitForEstablished(locker, absolute_timeout(timeout));
436 	TRACE("  Connect(): Connection complete: %s (timeout was %llu)",
437 		strerror(status), timeout);
438 	return posix_error(status);
439 }
440 
441 
442 status_t
443 TCPEndpoint::Accept(struct net_socket **_acceptedSocket)
444 {
445 	TRACE("Accept()");
446 
447 	MutexLocker locker(fLock);
448 
449 	status_t status;
450 	bigtime_t timeout = absolute_timeout(socket->receive.timeout);
451 
452 	do {
453 		locker.Unlock();
454 
455 		status = acquire_sem_etc(fAcceptSemaphore, 1, B_ABSOLUTE_TIMEOUT
456 			| B_CAN_INTERRUPT, timeout);
457 		if (status < B_OK)
458 			return status;
459 
460 		locker.Lock();
461 		status = gSocketModule->dequeue_connected(socket, _acceptedSocket);
462 		if (status == B_OK)
463 			TRACE("  Accept() returning %p", (*_acceptedSocket)->first_protocol);
464 	} while (status < B_OK);
465 
466 	return status;
467 }
468 
469 
470 status_t
471 TCPEndpoint::Bind(const sockaddr *address)
472 {
473 	if (address == NULL)
474 		return B_BAD_VALUE;
475 
476 	MutexLocker lock(fLock);
477 
478 	TRACE("Bind() on address %s", PrintAddress(address));
479 
480 	if (fState != CLOSED)
481 		return EISCONN;
482 
483 	return fManager->Bind(this, address);
484 }
485 
486 
487 status_t
488 TCPEndpoint::Unbind(struct sockaddr *address)
489 {
490 	TRACE("Unbind()");
491 
492 	MutexLocker lock(fLock);
493 	return fManager->Unbind(this);
494 }
495 
496 
497 status_t
498 TCPEndpoint::Listen(int count)
499 {
500 	TRACE("Listen()");
501 
502 	MutexLocker lock(fLock);
503 
504 	if (fState != CLOSED)
505 		return B_BAD_VALUE;
506 
507 	fAcceptSemaphore = create_sem(0, "tcp accept");
508 	if (fAcceptSemaphore < B_OK)
509 		return ENOBUFS;
510 
511 	status_t status = fManager->SetPassive(this);
512 	if (status < B_OK) {
513 		delete_sem(fAcceptSemaphore);
514 		fAcceptSemaphore = -1;
515 		return status;
516 	}
517 
518 	fState = LISTEN;
519 	return B_OK;
520 }
521 
522 
523 status_t
524 TCPEndpoint::Shutdown(int direction)
525 {
526 	TRACE("Shutdown(%i)", direction);
527 
528 	MutexLocker lock(fLock);
529 
530 	if (direction == SHUT_RD || direction == SHUT_RDWR)
531 		fFlags |= FLAG_NO_RECEIVE;
532 
533 	if (direction == SHUT_WR || direction == SHUT_RDWR)
534 		_ShutdownEgress(false);
535 
536 	return B_OK;
537 }
538 
539 
540 /*!
541 	Puts data contained in \a buffer into send buffer
542 */
543 status_t
544 TCPEndpoint::SendData(net_buffer *buffer)
545 {
546 	MutexLocker lock(fLock);
547 
548 	TRACE("SendData(buffer %p, size %lu, flags %lx) [total %lu bytes, has %lu]",
549 		  buffer, buffer->size, buffer->flags, fSendQueue.Size(),
550 		  fSendQueue.Free());
551 
552 	if (fState == CLOSED)
553 		return ENOTCONN;
554 	else if (fState == LISTEN) {
555 		return EDESTADDRREQ;
556 	} else if (fState == FINISH_SENT || fState == FINISH_ACKNOWLEDGED
557 				|| fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE
558 				|| fState == TIME_WAIT) {
559 		// TODO: send SIGPIPE signal to app?
560 		return EPIPE;
561 	}
562 
563 	if (buffer->size > 0) {
564 		if (buffer->size > fSendQueue.Size())
565 			return EMSGSIZE;
566 
567 		bigtime_t timeout = absolute_timeout(socket->send.timeout);
568 
569 		while (fSendQueue.Free() < buffer->size) {
570 			status_t status = fSendList.Wait(lock, timeout);
571 			if (status < B_OK) {
572 				TRACE("  SendData() returning %s (%d)",
573 					strerror(posix_error(status)), (int)posix_error(status));
574 				return posix_error(status);
575 			}
576 		}
577 
578 		fSendQueue.Add(buffer);
579 	}
580 
581 	TRACE("  SendData(): %lu bytes used.", fSendQueue.Used());
582 
583 	if (fState == ESTABLISHED || fState == FINISH_RECEIVED)
584 		_SendQueued();
585 
586 	return B_OK;
587 }
588 
589 
590 ssize_t
591 TCPEndpoint::SendAvailable()
592 {
593 	MutexLocker locker(fLock);
594 
595 	ssize_t available;
596 
597 	if (is_writable(fState))
598 		available = fSendQueue.Free();
599 	else
600 		available = EPIPE;
601 
602 	TRACE("SendAvailable(): %li", available);
603 	return available;
604 }
605 
606 
607 status_t
608 TCPEndpoint::FillStat(net_stat *stat)
609 {
610 	MutexLocker _(fLock);
611 
612 	strlcpy(stat->state, name_for_state(fState), sizeof(stat->state));
613 	stat->receive_queue_size = fReceiveQueue.Available();
614 	stat->send_queue_size = fSendQueue.Used();
615 
616 	return B_OK;
617 }
618 
619 
620 status_t
621 TCPEndpoint::ReadData(size_t numBytes, uint32 flags, net_buffer** _buffer)
622 {
623 	TRACE("ReadData(%lu bytes, flags 0x%x)", numBytes, (unsigned int)flags);
624 
625 	MutexLocker locker(fLock);
626 
627 	*_buffer = NULL;
628 
629 	if (fState == CLOSED)
630 		return ENOTCONN;
631 
632 	bigtime_t timeout = absolute_timeout(socket->receive.timeout);
633 
634 	if (fState == SYNCHRONIZE_SENT || fState == SYNCHRONIZE_RECEIVED) {
635 		if (flags & MSG_DONTWAIT)
636 			return B_WOULD_BLOCK;
637 
638 		status_t status = _WaitForEstablished(locker, timeout);
639 		if (status < B_OK)
640 			return posix_error(status);
641 	}
642 
643 	size_t dataNeeded = socket->receive.low_water_mark;
644 
645 	// When MSG_WAITALL is set then the function should block
646 	// until the full amount of data can be returned.
647 	if (flags & MSG_WAITALL)
648 		dataNeeded = numBytes;
649 
650 	// TODO: add support for urgent data (MSG_OOB)
651 
652 	while (true) {
653 		if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE
654 				|| fState == TIME_WAIT) {
655 			// ``Connection closing''.
656 			return B_OK;
657 		}
658 
659 		if (fReceiveQueue.Available() > 0) {
660 			if (fReceiveQueue.Available() >= dataNeeded ||
661 				((fReceiveQueue.PushedData() > 0)
662 					&& (fReceiveQueue.PushedData() >= fReceiveQueue.Available())))
663 				break;
664 		} else if (fState == FINISH_RECEIVED) {
665 			// ``If no text is awaiting delivery, the RECEIVE will
666 			//   get a Connection closing''.
667 			return B_OK;
668 		}
669 
670 		if ((flags & MSG_DONTWAIT) || (socket->receive.timeout == 0))
671 			return B_WOULD_BLOCK;
672 
673 		status_t status = fReceiveList.Wait(locker, timeout, false);
674 		if (status < B_OK) {
675 			// The Open Group base specification mentions that EINTR should be
676 			// returned if the recv() is interrupted before _any data_ is
677 			// available. So we actually check if there is data, and if so,
678 			// push it to the user.
679 			if ((status == B_TIMED_OUT || status == B_INTERRUPTED)
680 					&& fReceiveQueue.Available() > 0)
681 				break;
682 
683 			return posix_error(status);
684 		}
685 	}
686 
687 	TRACE("  ReadData(): %lu are available.", fReceiveQueue.Available());
688 
689 	if (numBytes < fReceiveQueue.Available())
690 		fReceiveList.Signal();
691 
692 	bool clone = (flags & MSG_PEEK);
693 
694 	ssize_t receivedBytes = fReceiveQueue.Get(numBytes, !clone, _buffer);
695 
696 	TRACE("  ReadData(): %lu bytes kept.", fReceiveQueue.Available());
697 
698 	// if we are opening the window, check if we should send an ACK
699 	if (!clone)
700 		SendAcknowledge(false);
701 
702 	return receivedBytes;
703 }
704 
705 
706 ssize_t
707 TCPEndpoint::ReadAvailable()
708 {
709 	MutexLocker locker(fLock);
710 
711 	TRACE("ReadAvailable(): %li", _AvailableData());
712 
713 	return _AvailableData();
714 }
715 
716 
717 status_t
718 TCPEndpoint::SetSendBufferSize(size_t length)
719 {
720 	MutexLocker _(fLock);
721 	fSendQueue.SetMaxBytes(length);
722 	return B_OK;
723 }
724 
725 
726 status_t
727 TCPEndpoint::SetReceiveBufferSize(size_t length)
728 {
729 	MutexLocker _(fLock);
730 	fReceiveQueue.SetMaxBytes(length);
731 	return B_OK;
732 }
733 
734 
735 status_t
736 TCPEndpoint::SetOption(int option, const void *_value, int length)
737 {
738 	if (option != TCP_NODELAY)
739 		return B_BAD_VALUE;
740 
741 	if (length != sizeof(int))
742 		return B_BAD_VALUE;
743 
744 	const int *value = (const int *)_value;
745 
746 	MutexLocker _(fLock);
747 	if (*value)
748 		fOptions |= TCP_NODELAY;
749 	else
750 		fOptions &= ~TCP_NODELAY;
751 
752 	return B_OK;
753 }
754 
755 
756 //	#pragma mark - misc
757 
758 
759 bool
760 TCPEndpoint::IsBound() const
761 {
762 	return !LocalAddress().IsEmpty(true);
763 }
764 
765 
766 void
767 TCPEndpoint::DeleteSocket()
768 {
769 	// the next call will delete `this'.
770 	gSocketModule->delete_socket(socket);
771 }
772 
773 
774 status_t
775 TCPEndpoint::DelayedAcknowledge()
776 {
777 	if (gStackModule->cancel_timer(&fDelayedAcknowledgeTimer)) {
778 		// timer was active, send an ACK now (with the exception above,
779 		// we send every other ACK)
780 		return SendAcknowledge(true);
781 	}
782 
783 	gStackModule->set_timer(&fDelayedAcknowledgeTimer, TCP_DELAYED_ACKNOWLEDGE_TIMEOUT);
784 	return B_OK;
785 }
786 
787 
788 status_t
789 TCPEndpoint::SendAcknowledge(bool force)
790 {
791 	return _SendQueued(force, 0);
792 }
793 
794 
795 void
796 TCPEndpoint::_StartPersistTimer()
797 {
798 	gStackModule->set_timer(&fPersistTimer, 1000000LL);
799 }
800 
801 
802 void
803 TCPEndpoint::_EnterTimeWait()
804 {
805 	gStackModule->set_timer(&fTimeWaitTimer, TCP_MAX_SEGMENT_LIFETIME << 1);
806 }
807 
808 
809 status_t
810 TCPEndpoint::UpdateTimeWait()
811 {
812 	return B_OK;
813 }
814 
815 
816 //	#pragma mark - receive
817 
818 
819 int32
820 TCPEndpoint::_ListenReceive(tcp_segment_header &segment, net_buffer *buffer)
821 {
822 	TRACE("ListenReceive()");
823 
824 	// Essentially, we accept only TCP_FLAG_SYNCHRONIZE in this state,
825 	// but the error behaviour differs
826 	if (segment.flags & TCP_FLAG_RESET)
827 		return DROP;
828 	if (segment.flags & TCP_FLAG_ACKNOWLEDGE)
829 		return DROP | RESET;
830 	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0)
831 		return DROP;
832 
833 	// TODO: drop broadcast/multicast
834 
835 	// spawn new endpoint for accept()
836 	net_socket *newSocket;
837 	if (gSocketModule->spawn_pending_socket(socket, &newSocket) < B_OK)
838 		return DROP;
839 
840 	return ((TCPEndpoint *)newSocket->first_protocol)->Spawn(this,
841 		segment, buffer);
842 }
843 
844 
845 int32
846 TCPEndpoint::Spawn(TCPEndpoint *parent, tcp_segment_header &segment,
847 	net_buffer *buffer)
848 {
849 	MutexLocker _(fLock);
850 
851 	// TODO error checking
852 	ProtocolSocket::Open();
853 
854 	fState = SYNCHRONIZE_RECEIVED;
855 	fManager = parent->fManager;
856 
857 	LocalAddress().SetTo(buffer->destination);
858 	PeerAddress().SetTo(buffer->source);
859 
860 	TRACE("Spawn()");
861 
862 	// TODO: proper error handling!
863 	if (fManager->BindChild(this) < B_OK)
864 		return DROP;
865 
866 	if (_PrepareSendPath(*PeerAddress()) < B_OK)
867 		return DROP;
868 
869 	fOptions = parent->fOptions;
870 	fAcceptSemaphore = parent->fAcceptSemaphore;
871 
872 	_PrepareReceivePath(segment);
873 
874 	// send SYN+ACK
875 	if (_SendQueued() < B_OK)
876 		return DROP;
877 
878 	segment.flags &= ~TCP_FLAG_SYNCHRONIZE;
879 		// we handled this flag now, it must not be set for further processing
880 
881 	return _Receive(segment, buffer);
882 }
883 
884 
885 void
886 TCPEndpoint::DumpInternalState() const
887 {
888 	kprintf("Lock: { sem: %ld, holder: %ld }\n", fLock.sem, fLock.holder);
889 	kprintf("AcceptSem: %ld\n", fAcceptSemaphore);
890 	kprintf("Options: 0x%lx\n", (uint32)fOptions);
891 	kprintf("SendWindowShift: %lu\n", (uint32)fSendWindowShift);
892 	kprintf("ReceiveWindowShift: %lu\n", (uint32)fReceiveWindowShift);
893 	kprintf("SendUnacknowledged: %lu\n", (uint32)fSendUnacknowledged);
894 	kprintf("SendNext: %lu\n", (uint32)fSendNext);
895 	kprintf("SendMax: %lu\n", (uint32)fSendMax);
896 	kprintf("SendWindow: %lu\n", fSendWindow);
897 	kprintf("SendMaxWindow: %lu\n", fSendMaxWindow);
898 	kprintf("SendMaxSegmentSize: %lu\n", fSendMaxSegmentSize);
899 	kprintf("Send-Q: %lu / %lu\n", fSendQueue.Used(), fSendQueue.Size());
900 	kprintf("LastAcknowledgeSent: %lu\n", (uint32)fLastAcknowledgeSent);
901 	kprintf("InitialSendSequence: %lu\n", (uint32)fInitialSendSequence);
902 	kprintf("DuplicateAcknowledgeCount: %lu\n", fDuplicateAcknowledgeCount);
903 	kprintf("ReceiveNext: %lu\n", (uint32)fReceiveNext);
904 	kprintf("ReceiveMaxAdvertised: %lu\n", (uint32)fReceiveMaxAdvertised);
905 	kprintf("ReceiveWindow: %lu\n", (uint32)fReceiveWindow);
906 	kprintf("ReceiveMaxSegmentSize: %lu\n", (uint32)fReceiveMaxSegmentSize);
907 	kprintf("Recv-Q: %lu / %lu\n", fReceiveQueue.Available(),
908 		fReceiveQueue.Size());
909 	kprintf("InitialReceiveSequence: %lu\n", (uint32)fInitialReceiveSequence);
910 	kprintf("RoundTripTime: %ld (dev %ld)\n", fRoundTripTime,
911 		fRoundTripDeviation);
912 	kprintf("RetransmitTimeout: %llu\n", (uint64)fRetransmitTimeout);
913 	kprintf("CongestionWindow: %lu\n", fCongestionWindow);
914 	kprintf("SlowStartThreshold: %lu\n", fSlowStartThreshold);
915 	kprintf("State: %s\n", name_for_state(fState));
916 	kprintf("Flags: 0x%lx\n", fFlags);
917 }
918 
919 
920 int32
921 TCPEndpoint::_SynchronizeSentReceive(tcp_segment_header &segment, net_buffer *buffer)
922 {
923 	TRACE("SynchronizeSentReceive()");
924 
925 	if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0
926 		&& (fInitialSendSequence >= segment.acknowledge
927 			|| fSendMax < segment.acknowledge))
928 		return DROP | RESET;
929 
930 	if (segment.flags & TCP_FLAG_RESET) {
931 		fError = ECONNREFUSED;
932 		fState = CLOSED;
933 		return DROP;
934 	}
935 
936 	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0)
937 		return DROP;
938 
939 	fSendUnacknowledged = segment.acknowledge;
940 	_PrepareReceivePath(segment);
941 
942 	if (segment.flags & TCP_FLAG_ACKNOWLEDGE) {
943 		_MarkEstablished();
944 	} else {
945 		// simultaneous open
946 		fState = SYNCHRONIZE_RECEIVED;
947 	}
948 
949 	segment.flags &= ~TCP_FLAG_SYNCHRONIZE;
950 		// we handled this flag now, it must not be set for further processing
951 
952 	return _Receive(segment, buffer) | IMMEDIATE_ACKNOWLEDGE;
953 }
954 
955 
956 int32
957 TCPEndpoint::SegmentReceived(tcp_segment_header &segment, net_buffer *buffer)
958 {
959 	MutexLocker locker(fLock);
960 
961 	TRACE("SegmentReceived(): buffer %p (%lu bytes) address %s to %s",
962 		buffer, buffer->size, PrintAddress(buffer->source),
963 		PrintAddress(buffer->destination));
964 	TRACE("                   flags 0x%x, seq %lu, ack %lu, wnd %lu",
965 		segment.flags, segment.sequence, segment.acknowledge,
966 		(uint32)segment.advertised_window << fSendWindowShift);
967 
968 	int32 segmentAction = DROP;
969 
970 	switch (fState) {
971 		case LISTEN:
972 			segmentAction = _ListenReceive(segment, buffer);
973 			break;
974 
975 		case SYNCHRONIZE_SENT:
976 			segmentAction = _SynchronizeSentReceive(segment, buffer);
977 			break;
978 
979 		case SYNCHRONIZE_RECEIVED:
980 		case ESTABLISHED:
981 		case FINISH_RECEIVED:
982 		case WAIT_FOR_FINISH_ACKNOWLEDGE:
983 		case FINISH_SENT:
984 		case FINISH_ACKNOWLEDGED:
985 		case CLOSING:
986 		case TIME_WAIT:
987 		case CLOSED:
988 			segmentAction = _SegmentReceived(segment, buffer);
989 			break;
990 	}
991 
992 	// process acknowledge action as asked for by the *Receive() method
993 	if (segmentAction & IMMEDIATE_ACKNOWLEDGE)
994 		SendAcknowledge(true);
995 	else if (segmentAction & ACKNOWLEDGE)
996 		DelayedAcknowledge();
997 
998 	return segmentAction;
999 }
1000 
1001 int32
1002 TCPEndpoint::_SegmentReceived(tcp_segment_header &segment, net_buffer *buffer)
1003 {
1004 	uint32 advertisedWindow = (uint32)segment.advertised_window << fSendWindowShift;
1005 
1006 	// First, handle the most common case for uni-directional data transfer
1007 	// (known as header prediction - the segment must not change the window,
1008 	// and must be the expected sequence, and contain no control flags)
1009 
1010 	if (fState == ESTABLISHED
1011 		&& segment.AcknowledgeOnly()
1012 		&& fReceiveNext == segment.sequence
1013 		&& advertisedWindow > 0 && advertisedWindow == fSendWindow
1014 		&& fSendNext == fSendMax) {
1015 
1016 		_UpdateTimestamps(segment, buffer->size);
1017 
1018 		if (buffer->size == 0) {
1019 			// this is a pure acknowledge segment - we're on the sending end
1020 			if (fSendUnacknowledged < segment.acknowledge
1021 				&& fSendMax >= segment.acknowledge) {
1022 				_Acknowledged(segment);
1023 				return DROP;
1024 			}
1025 		} else if (segment.acknowledge == fSendUnacknowledged
1026 			&& fReceiveQueue.IsContiguous()
1027 			&& fReceiveQueue.Free() >= buffer->size
1028 			&& !(fFlags & FLAG_NO_RECEIVE)) {
1029 			if (_AddData(segment, buffer))
1030 				_NotifyReader();
1031 			return KEEP | ((segment.flags & TCP_FLAG_PUSH) ?
1032 				IMMEDIATE_ACKNOWLEDGE : ACKNOWLEDGE);
1033 		}
1034 	}
1035 
1036 	// The fast path was not applicable, so we continue with the standard
1037 	// processing of the incoming segment
1038 
1039 	if (fState != SYNCHRONIZE_SENT && fState != LISTEN && fState != CLOSED) {
1040 		// 1. check sequence number
1041 		if (!segment_in_sequence(segment, buffer->size, fReceiveNext,
1042 				fReceiveWindow)) {
1043 			TRACE("  Receive(): segment out of window, next: %lu wnd: %lu",
1044 				(uint32)fReceiveNext, fReceiveWindow);
1045 			if (segment.flags & TCP_FLAG_RESET)
1046 				return DROP;
1047 			return DROP | IMMEDIATE_ACKNOWLEDGE;
1048 		}
1049 	}
1050 
1051 	return _Receive(segment, buffer);
1052 }
1053 
1054 
1055 //	#pragma mark - send
1056 
1057 
1058 inline uint8
1059 TCPEndpoint::_CurrentFlags()
1060 {
1061 	// we don't set FLAG_FINISH here, instead we do it
1062 	// conditionally below depending if we are sending
1063 	// the last bytes of the send queue.
1064 
1065 	switch (fState) {
1066 		case CLOSED:
1067 			return TCP_FLAG_RESET | TCP_FLAG_ACKNOWLEDGE;
1068 
1069 		case SYNCHRONIZE_SENT:
1070 			return TCP_FLAG_SYNCHRONIZE;
1071 		case SYNCHRONIZE_RECEIVED:
1072 			return TCP_FLAG_SYNCHRONIZE | TCP_FLAG_ACKNOWLEDGE;
1073 
1074 		case ESTABLISHED:
1075 		case FINISH_RECEIVED:
1076 		case FINISH_ACKNOWLEDGED:
1077 		case TIME_WAIT:
1078 		case WAIT_FOR_FINISH_ACKNOWLEDGE:
1079 		case FINISH_SENT:
1080 		case CLOSING:
1081 			return TCP_FLAG_ACKNOWLEDGE;
1082 
1083 		default:
1084 			return B_ERROR;
1085 	}
1086 }
1087 
1088 
1089 inline bool
1090 TCPEndpoint::_ShouldSendSegment(tcp_segment_header &segment, uint32 length,
1091 	uint32 segmentMaxSize, uint32 flightSize)
1092 {
1093 	if (length > 0) {
1094 		// Avoid the silly window syndrome - we only send a segment in case:
1095 		// - we have a full segment to send, or
1096 		// - we're at the end of our buffer queue, or
1097 		// - the buffer is at least larger than half of the maximum send window, or
1098 		// - we're retransmitting data
1099 		if (length == segmentMaxSize
1100 			|| (fOptions & TCP_NODELAY) != 0
1101 			|| tcp_sequence(fSendNext + length) == fSendQueue.LastSequence()
1102 			|| (fSendMaxWindow > 0 && length >= fSendMaxWindow / 2))
1103 			return true;
1104 	}
1105 
1106 	// check if we need to send a window update to the peer
1107 	if (segment.advertised_window > 0) {
1108 		// correct the window to take into account what already has been advertised
1109 		uint32 window = (segment.advertised_window << fReceiveWindowShift)
1110 			- (fReceiveMaxAdvertised - fReceiveNext);
1111 
1112 		// if we can advertise a window larger than twice the maximum segment
1113 		// size, or half the maximum buffer size we send a window update
1114 		if (window >= (fReceiveMaxSegmentSize << 1)
1115 			|| window >= (socket->receive.buffer_size >> 1))
1116 			return true;
1117 	}
1118 
1119 	if ((segment.flags & (TCP_FLAG_SYNCHRONIZE | TCP_FLAG_FINISH | TCP_FLAG_RESET)) != 0)
1120 		return true;
1121 
1122 	// there is no reason to send a segment just now
1123 	return false;
1124 }
1125 
1126 
1127 status_t
1128 TCPEndpoint::_SendQueued(bool force)
1129 {
1130 	return _SendQueued(force, fSendWindow);
1131 }
1132 
1133 
1134 /*!
1135 	Sends one or more TCP segments with the data waiting in the queue, or some
1136 	specific flags that need to be sent.
1137 */
1138 status_t
1139 TCPEndpoint::_SendQueued(bool force, uint32 sendWindow)
1140 {
1141 	if (fRoute == NULL)
1142 		return B_ERROR;
1143 
1144 	// in passive state?
1145 	if (fState == LISTEN)
1146 		return B_ERROR;
1147 
1148 	tcp_segment_header segment(_CurrentFlags());
1149 
1150 	if ((fOptions & TCP_NOOPT) == 0) {
1151 		if (fFlags & FLAG_OPTION_TIMESTAMP) {
1152 			segment.options |= TCP_HAS_TIMESTAMPS;
1153 			segment.timestamp_reply = fReceivedTimestamp;
1154 			segment.timestamp_value = tcp_now();
1155 		}
1156 
1157 		if ((segment.flags & TCP_FLAG_SYNCHRONIZE)
1158 			&& (fSendNext == fInitialSendSequence)) {
1159 			// add connection establishment options
1160 			segment.max_segment_size = fReceiveMaxSegmentSize;
1161 			if (fFlags & FLAG_OPTION_WINDOW_SCALE) {
1162 				segment.options |= TCP_HAS_WINDOW_SCALE;
1163 				segment.window_shift = fReceiveWindowShift;
1164 			}
1165 		}
1166 	}
1167 
1168 	size_t availableBytes = fReceiveQueue.Free();
1169 	if (fFlags & FLAG_OPTION_WINDOW_SCALE)
1170 		segment.advertised_window = availableBytes >> fReceiveWindowShift;
1171 	else
1172 		segment.advertised_window = min_c(TCP_MAX_WINDOW, availableBytes);
1173 
1174 	segment.acknowledge = fReceiveNext;
1175 	segment.urgent_offset = 0;
1176 
1177 	if (fCongestionWindow > 0 && fCongestionWindow < sendWindow)
1178 		sendWindow = fCongestionWindow;
1179 
1180 	// SND.UNA  SND.NXT        SND.MAX
1181 	//  |        |              |
1182 	//  v        v              v
1183 	//  -----------------------------------
1184 	//  | effective window           |
1185 	//  -----------------------------------
1186 
1187 	// Flight size represents the window of data which is currently in the
1188 	// ether. We should never send data such as the flight size becomes larger
1189 	// than the effective window. Note however that the effective window may be
1190 	// reduced (by congestion for instance), so at some point in time flight
1191 	// size may be larger than the currently calculated window.
1192 
1193 	uint32 flightSize = fSendMax - fSendUnacknowledged;
1194 	uint32 consumedWindow = fSendNext - fSendUnacknowledged;
1195 
1196 	if (consumedWindow > sendWindow) {
1197 		sendWindow = 0;
1198 		// TODO enter persist state? try to get a window update.
1199 	} else
1200 		sendWindow -= consumedWindow;
1201 
1202 	if (force && sendWindow == 0 && fSendNext <= fSendQueue.LastSequence()) {
1203 		// send one byte of data to ask for a window update
1204 		// (triggered by the persist timer)
1205 		sendWindow = 1;
1206 	}
1207 
1208 	uint32 length = min_c(fSendQueue.Available(fSendNext), sendWindow);
1209 	tcp_sequence previousSendNext = fSendNext;
1210 
1211 	do {
1212 		uint32 segmentMaxSize = fSendMaxSegmentSize
1213 			- tcp_options_length(segment);
1214 		uint32 segmentLength = min_c(length, segmentMaxSize);
1215 
1216 		if (fSendNext + segmentLength == fSendQueue.LastSequence()) {
1217 			if (state_needs_finish(fState))
1218 				segment.flags |= TCP_FLAG_FINISH;
1219 			if (length > 0)
1220 				segment.flags |= TCP_FLAG_PUSH;
1221 		}
1222 
1223 		// Determine if we should really send this segment
1224 		if (!force && !_ShouldSendSegment(segment, segmentLength,
1225 			segmentMaxSize, flightSize)) {
1226 			if (fSendQueue.Available()
1227 				&& !gStackModule->is_timer_active(&fPersistTimer)
1228 				&& !gStackModule->is_timer_active(&fRetransmitTimer))
1229 				_StartPersistTimer();
1230 			break;
1231 		}
1232 
1233 		net_buffer *buffer = gBufferModule->create(256);
1234 		if (buffer == NULL)
1235 			return B_NO_MEMORY;
1236 
1237 		status_t status = B_OK;
1238 		if (segmentLength > 0)
1239 			status = fSendQueue.Get(buffer, fSendNext, segmentLength);
1240 		if (status < B_OK) {
1241 			gBufferModule->free(buffer);
1242 			return status;
1243 		}
1244 
1245 		LocalAddress().CopyTo(buffer->source);
1246 		PeerAddress().CopyTo(buffer->destination);
1247 
1248 		uint32 size = buffer->size;
1249 		segment.sequence = fSendNext;
1250 
1251 		TRACE("SendQueued(): buffer %p (%lu bytes) address %s to %s",
1252 			buffer, buffer->size, PrintAddress(buffer->source),
1253 			PrintAddress(buffer->destination));
1254 		TRACE("              flags 0x%x, seq %lu, ack %lu, rwnd %hu, cwnd %lu"
1255 			", ssthresh %lu", segment.flags, segment.sequence,
1256 			segment.acknowledge, segment.advertised_window,
1257 			fCongestionWindow, fSlowStartThreshold);
1258 		TRACE("              len %lu first %lu last %lu", segmentLength,
1259 			(uint32)fSendQueue.FirstSequence(),
1260 			(uint32)fSendQueue.LastSequence());
1261 
1262 		PROBE(buffer, sendWindow);
1263 		sendWindow -= buffer->size;
1264 
1265 		status = add_tcp_header(AddressModule(), segment, buffer);
1266 		if (status != B_OK) {
1267 			gBufferModule->free(buffer);
1268 			return status;
1269 		}
1270 
1271 		// Update send status - we need to do this before we send the data
1272 		// for local connections as the answer is directly handled
1273 
1274 		if (segment.flags & TCP_FLAG_SYNCHRONIZE) {
1275 			segment.options &= ~TCP_HAS_WINDOW_SCALE;
1276 			segment.max_segment_size = 0;
1277 			size++;
1278 		}
1279 
1280 		if (segment.flags & TCP_FLAG_FINISH)
1281 			size++;
1282 
1283 		uint32 sendMax = fSendMax;
1284 		fSendNext += size;
1285 		if (fSendMax < fSendNext)
1286 			fSendMax = fSendNext;
1287 
1288 		fReceiveMaxAdvertised = fReceiveNext
1289 			+ ((uint32)segment.advertised_window << fReceiveWindowShift);
1290 
1291 		status = next->module->send_routed_data(next, fRoute, buffer);
1292 		if (status < B_OK) {
1293 			gBufferModule->free(buffer);
1294 
1295 			fSendNext = segment.sequence;
1296 			fSendMax = sendMax;
1297 				// restore send status
1298 			return status;
1299 		}
1300 
1301 		if (segment.flags & TCP_FLAG_ACKNOWLEDGE)
1302 			fLastAcknowledgeSent = segment.acknowledge;
1303 
1304 		length -= segmentLength;
1305 		segment.flags &= ~(TCP_FLAG_SYNCHRONIZE | TCP_FLAG_RESET | TCP_FLAG_FINISH);
1306 	} while (length > 0);
1307 
1308 	// if we sent data from the beggining of the send queue,
1309 	// start the retransmition timer
1310 	if (previousSendNext == fSendUnacknowledged
1311 		&& fSendNext > previousSendNext) {
1312 		TRACE("  SendQueue(): set retransmit timer with rto %llu",
1313 			fRetransmitTimeout);
1314 
1315 		gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout);
1316 	}
1317 
1318 	return B_OK;
1319 }
1320 
1321 
1322 int
1323 TCPEndpoint::_MaxSegmentSize(const sockaddr *address) const
1324 {
1325 	return next->module->get_mtu(next, address) - sizeof(tcp_header);
1326 }
1327 
1328 
1329 status_t
1330 TCPEndpoint::_ShutdownEgress(bool closing)
1331 {
1332 	tcp_state previousState = fState;
1333 
1334 	if (fState == SYNCHRONIZE_RECEIVED || fState == ESTABLISHED)
1335 		fState = FINISH_SENT;
1336 	else if (fState == FINISH_RECEIVED)
1337 		fState = WAIT_FOR_FINISH_ACKNOWLEDGE;
1338 	else
1339 		return B_OK;
1340 
1341 	status_t status = _SendQueued();
1342 	if (status != B_OK) {
1343 		fState = previousState;
1344 		return status;
1345 	}
1346 
1347 	return B_OK;
1348 }
1349 
1350 
1351 ssize_t
1352 TCPEndpoint::_AvailableData() const
1353 {
1354 	// TODO: Refer to the FLAG_NO_RECEIVE comment above regarding
1355 	//       the application of FLAG_NO_RECEIVE in listen()ing
1356 	//       sockets.
1357 	if (fState == LISTEN)
1358 		return gSocketModule->count_connected(socket);
1359 	else if (fState == SYNCHRONIZE_SENT)
1360 		return 0;
1361 
1362 	ssize_t availableData = fReceiveQueue.Available();
1363 
1364 	if (availableData == 0 && !_ShouldReceive())
1365 		return ENOTCONN;
1366 
1367 	return availableData;
1368 }
1369 
1370 
1371 void
1372 TCPEndpoint::_NotifyReader()
1373 {
1374 	fReceiveList.Signal();
1375 	gSocketModule->notify(socket, B_SELECT_READ, _AvailableData());
1376 }
1377 
1378 
1379 bool
1380 TCPEndpoint::_ShouldReceive() const
1381 {
1382 	if (fFlags & FLAG_NO_RECEIVE)
1383 		return false;
1384 
1385 	return fState == ESTABLISHED || fState == FINISH_SENT
1386 			|| fState == FINISH_ACKNOWLEDGED;
1387 }
1388 
1389 
1390 int32
1391 TCPEndpoint::_Receive(tcp_segment_header &segment, net_buffer *buffer)
1392 {
1393 	uint32 advertisedWindow = (uint32)segment.advertised_window << fSendWindowShift;
1394 
1395 	size_t segmentLength = buffer->size;
1396 
1397 	if (segment.flags & TCP_FLAG_RESET) {
1398 		// is this a valid reset?
1399 		if (fLastAcknowledgeSent <= segment.sequence
1400 			&& tcp_sequence(segment.sequence)
1401 				< (fLastAcknowledgeSent + fReceiveWindow)) {
1402 			if (fState == SYNCHRONIZE_RECEIVED)
1403 				fError = ECONNREFUSED;
1404 			else if (fState == CLOSING || fState == TIME_WAIT
1405 					|| fState == WAIT_FOR_FINISH_ACKNOWLEDGE)
1406 				fError = ENOTCONN;
1407 			else
1408 				fError = ECONNRESET;
1409 
1410 			_NotifyReader();
1411 			fState = CLOSED;
1412 		}
1413 
1414 		return DROP;
1415 	}
1416 
1417 	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0
1418 		|| (fState == SYNCHRONIZE_RECEIVED
1419 			&& (fInitialReceiveSequence > segment.sequence
1420 				|| (segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0
1421 					&& (fSendUnacknowledged > segment.acknowledge
1422 						|| fSendMax < segment.acknowledge)))) {
1423 		// reset the connection - either the initial SYN was faulty, or we
1424 		// received a SYN within the data stream
1425 		return DROP | RESET;
1426 	}
1427 
1428 	fReceiveWindow = max_c(fReceiveQueue.Free(), fReceiveWindow);
1429 		// the window must not shrink
1430 
1431 	// trim buffer to be within the receive window
1432 	int32 drop = fReceiveNext - segment.sequence;
1433 	if (drop > 0) {
1434 		if ((uint32)drop > buffer->size
1435 			|| ((uint32)drop == buffer->size
1436 				&& (segment.flags & TCP_FLAG_FINISH) == 0)) {
1437 			// don't accidently remove a FIN we shouldn't remove
1438 			segment.flags &= ~TCP_FLAG_FINISH;
1439 			drop = buffer->size;
1440 		}
1441 
1442 		// remove duplicate data at the start
1443 		TRACE("* remove %ld bytes from the start", drop);
1444 		gBufferModule->remove_header(buffer, drop);
1445 		segment.sequence += drop;
1446 	}
1447 
1448 	int32 action = KEEP;
1449 
1450 	drop = segment.sequence + buffer->size - (fReceiveNext + fReceiveWindow);
1451 	if (drop > 0) {
1452 		// remove data exceeding our window
1453 		if ((uint32)drop >= buffer->size) {
1454 			// if we can accept data, or the segment is not what we'd expect,
1455 			// drop the segment (an immediate acknowledge is always triggered)
1456 			if (fReceiveWindow != 0 || segment.sequence != fReceiveNext)
1457 				return DROP | IMMEDIATE_ACKNOWLEDGE;
1458 
1459 			action |= IMMEDIATE_ACKNOWLEDGE;
1460 		}
1461 
1462 		if ((segment.flags & TCP_FLAG_FINISH) != 0) {
1463 			// we need to remove the finish, too, as part of the data
1464 			drop--;
1465 		}
1466 
1467 		segment.flags &= ~(TCP_FLAG_FINISH | TCP_FLAG_PUSH);
1468 		TRACE("* remove %ld bytes from the end", drop);
1469 		gBufferModule->remove_trailer(buffer, drop);
1470 	}
1471 
1472 	if (advertisedWindow > fSendWindow)
1473 		TRACE("  Receive(): Window update %lu -> %lu", fSendWindow,
1474 			advertisedWindow);
1475 
1476 	fSendWindow = advertisedWindow;
1477 	if (advertisedWindow > fSendMaxWindow)
1478 		fSendMaxWindow = advertisedWindow;
1479 
1480 	// Then look at the acknowledgement for any updates
1481 
1482 	if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0) {
1483 		// process acknowledged data
1484 		if (fState == SYNCHRONIZE_RECEIVED)
1485 			_MarkEstablished();
1486 
1487 		if (fSendMax < segment.acknowledge || fState == TIME_WAIT)
1488 			return DROP | IMMEDIATE_ACKNOWLEDGE;
1489 
1490 		if (segment.acknowledge < fSendUnacknowledged) {
1491 			if (buffer->size == 0 && advertisedWindow == fSendWindow
1492 				&& (segment.flags & TCP_FLAG_FINISH) == 0) {
1493 				TRACE("Receive(): duplicate ack!");
1494 
1495 				_DuplicateAcknowledge(segment);
1496 			}
1497 
1498 			return DROP;
1499 		} else {
1500 			// this segment acknowledges in flight data
1501 
1502 			if (fDuplicateAcknowledgeCount >= 3) {
1503 				// deflate the window.
1504 				fCongestionWindow = fSlowStartThreshold;
1505 			}
1506 
1507 			fDuplicateAcknowledgeCount = 0;
1508 
1509 			if (fSendMax == segment.acknowledge)
1510 				TRACE("Receive(): all inflight data ack'd!");
1511 
1512 			if (segment.acknowledge > fSendQueue.LastSequence()
1513 					&& fState > ESTABLISHED) {
1514 				TRACE("Receive(): FIN has been acknowledged!");
1515 
1516 				switch (fState) {
1517 					case FINISH_SENT:
1518 						fState = FINISH_ACKNOWLEDGED;
1519 						break;
1520 					case CLOSING:
1521 						fState = TIME_WAIT;
1522 						_EnterTimeWait();
1523 						return DROP;
1524 					case WAIT_FOR_FINISH_ACKNOWLEDGE:
1525 						fState = CLOSED;
1526 						break;
1527 
1528 					default:
1529 						break;
1530 				}
1531 			}
1532 
1533 			if (fState != CLOSED)
1534 				_Acknowledged(segment);
1535 		}
1536 	}
1537 
1538 	if (segment.flags & TCP_FLAG_URGENT) {
1539 		if (fState == ESTABLISHED || fState == FINISH_SENT
1540 			|| fState == FINISH_ACKNOWLEDGED) {
1541 			// TODO: Handle urgent data:
1542 			//  - RCV.UP <- max(RCV.UP, SEG.UP)
1543 			//  - signal the user that urgent data is available (SIGURG)
1544 		}
1545 	}
1546 
1547 	bool notify = false;
1548 
1549 	if (buffer->size > 0 &&	_ShouldReceive())
1550 		notify = _AddData(segment, buffer);
1551 	else
1552 		action = (action & ~KEEP) | DROP;
1553 
1554 	if (segment.flags & TCP_FLAG_FINISH) {
1555 		segmentLength++;
1556 		if (fState != CLOSED && fState != LISTEN && fState != SYNCHRONIZE_SENT) {
1557 			TRACE("Receive(): peer is finishing connection!");
1558 			fReceiveNext++;
1559 			notify = true;
1560 
1561 			// FIN implies PSH
1562 			fReceiveQueue.SetPushPointer();
1563 
1564 			// we'll reply immediatly to the FIN if we are not
1565 			// transitioning to TIME WAIT so we immediatly ACK it.
1566 			action |= IMMEDIATE_ACKNOWLEDGE;
1567 
1568 			// other side is closing connection; change states
1569 			switch (fState) {
1570 				case ESTABLISHED:
1571 				case SYNCHRONIZE_RECEIVED:
1572 					fState = FINISH_RECEIVED;
1573 					break;
1574 				case FINISH_SENT:
1575 					// simultaneous close
1576 					fState = CLOSING;
1577 					break;
1578 				case FINISH_ACKNOWLEDGED:
1579 					fState = TIME_WAIT;
1580 					_EnterTimeWait();
1581 					break;
1582 				default:
1583 					break;
1584 			}
1585 		}
1586 	}
1587 
1588 	if (notify)
1589 		_NotifyReader();
1590 
1591 	if (buffer->size > 0 || (segment.flags & TCP_FLAG_SYNCHRONIZE) != 0)
1592 		action |= ACKNOWLEDGE;
1593 
1594 	_UpdateTimestamps(segment, segmentLength);
1595 
1596 	TRACE("Receive() Action %ld", action);
1597 
1598 	return action;
1599 }
1600 
1601 
1602 void
1603 TCPEndpoint::_UpdateTimestamps(tcp_segment_header &segment,
1604 	size_t segmentLength)
1605 {
1606 	if (fFlags & FLAG_OPTION_TIMESTAMP) {
1607 		tcp_sequence sequence(segment.sequence);
1608 
1609 		if ((fLastAcknowledgeSent >= sequence
1610 				&& fLastAcknowledgeSent < (sequence + segmentLength)))
1611 			fReceivedTimestamp = segment.timestamp_value;
1612 	}
1613 }
1614 
1615 
1616 void
1617 TCPEndpoint::_MarkEstablished()
1618 {
1619 	fState = ESTABLISHED;
1620 
1621 	if (socket->parent != NULL) {
1622 		gSocketModule->set_connected(socket);
1623 		release_sem_etc(fAcceptSemaphore, 1, B_DO_NOT_RESCHEDULE);
1624 	}
1625 
1626 	fSendList.Signal();
1627 }
1628 
1629 
1630 status_t
1631 TCPEndpoint::_WaitForEstablished(MutexLocker &locker, bigtime_t timeout)
1632 {
1633 	while (fState != ESTABLISHED) {
1634 		status_t status = fSendList.Wait(locker, timeout);
1635 		if (status < B_OK)
1636 			return status;
1637 	}
1638 
1639 	return B_OK;
1640 }
1641 
1642 
1643 bool
1644 TCPEndpoint::_AddData(tcp_segment_header &segment, net_buffer *buffer)
1645 {
1646 	fReceiveQueue.Add(buffer, segment.sequence);
1647 	fReceiveNext = fReceiveQueue.NextSequence();
1648 
1649 	TRACE("  _AddData(): adding data, receive next = %lu. Now have %lu bytes.",
1650 		(uint32)fReceiveNext, fReceiveQueue.Available());
1651 
1652 	if (segment.flags & TCP_FLAG_PUSH)
1653 		fReceiveQueue.SetPushPointer();
1654 
1655 	return fReceiveQueue.Available() > 0;
1656 }
1657 
1658 
1659 void
1660 TCPEndpoint::_PrepareReceivePath(tcp_segment_header &segment)
1661 {
1662 	fInitialReceiveSequence = segment.sequence;
1663 
1664 	// count the received SYN
1665 	segment.sequence++;
1666 
1667 	fReceiveNext = segment.sequence;
1668 	fReceiveQueue.SetInitialSequence(segment.sequence);
1669 
1670 	if ((fOptions & TCP_NOOPT) == 0) {
1671 		if (segment.max_segment_size > 0)
1672 			fSendMaxSegmentSize = segment.max_segment_size;
1673 
1674 		if (segment.options & TCP_HAS_WINDOW_SCALE) {
1675 			fFlags |= FLAG_OPTION_WINDOW_SCALE;
1676 			fSendWindowShift = segment.window_shift;
1677 		} else {
1678 			fFlags &= ~FLAG_OPTION_WINDOW_SCALE;
1679 			fReceiveWindowShift = 0;
1680 		}
1681 
1682 		if (segment.options & TCP_HAS_TIMESTAMPS) {
1683 			fFlags |= FLAG_OPTION_TIMESTAMP;
1684 			fReceivedTimestamp = segment.timestamp_value;
1685 		} else
1686 			fFlags &= ~FLAG_OPTION_TIMESTAMP;
1687 	}
1688 
1689 	fCongestionWindow = 2 * fSendMaxSegmentSize;
1690 	fSlowStartThreshold = (uint32)segment.advertised_window << fSendWindowShift;
1691 }
1692 
1693 
1694 status_t
1695 TCPEndpoint::_PrepareSendPath(const sockaddr *peer)
1696 {
1697 	if (fRoute == NULL) {
1698 		fRoute = gDatalinkModule->get_route(Domain(), peer);
1699 		if (fRoute == NULL)
1700 			return ENETUNREACH;
1701 	}
1702 
1703 	// make sure connection does not already exist
1704 	status_t status = fManager->SetConnection(this, *LocalAddress(), peer,
1705 		fRoute->interface->address);
1706 	if (status < B_OK)
1707 		return status;
1708 
1709 	fInitialSendSequence = system_time() >> 4;
1710 	fSendNext = fInitialSendSequence;
1711 	fSendUnacknowledged = fInitialSendSequence;
1712 	fSendMax = fInitialSendSequence;
1713 
1714 	// we are counting the SYN here
1715 	fSendQueue.SetInitialSequence(fSendNext + 1);
1716 
1717 	fReceiveMaxSegmentSize = _MaxSegmentSize(peer);
1718 
1719 	// Compute the window shift we advertise to our peer - if it doesn't support
1720 	// this option, this will be reset to 0 (when its SYN is received)
1721 	fReceiveWindowShift = 0;
1722 	while (fReceiveWindowShift < TCP_MAX_WINDOW_SHIFT
1723 		&& (0xffffUL << fReceiveWindowShift) < socket->receive.buffer_size) {
1724 		fReceiveWindowShift++;
1725 	}
1726 
1727 	return B_OK;
1728 }
1729 
1730 
1731 void
1732 TCPEndpoint::_Acknowledged(tcp_segment_header &segment)
1733 {
1734 	size_t previouslyUsed = fSendQueue.Used();
1735 
1736 	fSendQueue.RemoveUntil(segment.acknowledge);
1737 	fSendUnacknowledged = segment.acknowledge;
1738 
1739 	if (fSendNext < fSendUnacknowledged)
1740 		fSendNext = fSendUnacknowledged;
1741 
1742 	if (fSendUnacknowledged == fSendMax)
1743 		gStackModule->cancel_timer(&fRetransmitTimer);
1744 
1745 	if (fSendQueue.Used() < previouslyUsed) {
1746 		// this ACK acknowledged data
1747 
1748 		if (segment.options & TCP_HAS_TIMESTAMPS)
1749 			_UpdateSRTT(tcp_diff_timestamp(segment.timestamp_reply));
1750 		else {
1751 			// TODO Fallback to RFC 793 type estimation
1752 		}
1753 
1754 		if (is_writable(fState)) {
1755 			// notify threads waiting on the socket to become writable again
1756 			fSendList.Signal();
1757 			gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Used());
1758 		}
1759 
1760 		if (fCongestionWindow < fSlowStartThreshold)
1761 			fCongestionWindow += fSendMaxSegmentSize;
1762 	}
1763 
1764 	if (fCongestionWindow >= fSlowStartThreshold) {
1765 		uint32 increment = fSendMaxSegmentSize * fSendMaxSegmentSize;
1766 
1767 		if (increment < fCongestionWindow)
1768 			increment = 1;
1769 		else
1770 			increment /= fCongestionWindow;
1771 
1772 		fCongestionWindow += increment;
1773 	}
1774 
1775 	// if there is data left to be send, send it now
1776 	if (fSendQueue.Used() > 0)
1777 		_SendQueued();
1778 }
1779 
1780 
1781 void
1782 TCPEndpoint::_Retransmit()
1783 {
1784 	TRACE("Retransmit()");
1785 	_ResetSlowStart();
1786 	fSendNext = fSendUnacknowledged;
1787 	_SendQueued();
1788 }
1789 
1790 
1791 void
1792 TCPEndpoint::_UpdateSRTT(int32 roundTripTime)
1793 {
1794 	int32 rtt = roundTripTime;
1795 
1796 	// Update_SRTT() as per Van Jacobson
1797 	rtt -= (fRoundTripTime / 8);
1798 	fRoundTripTime += rtt;
1799 	if (rtt < 0)
1800 		rtt = -rtt;
1801 	rtt -= (fRoundTripDeviation / 4);
1802 	fRoundTripDeviation += rtt;
1803 
1804 	fRetransmitTimeout = ((fRoundTripTime / 4 +
1805 		fRoundTripDeviation) / 2) * kTimestampFactor;
1806 
1807 	TRACE("  RTO is now %llu (after rtt %ldms)", fRetransmitTimeout,
1808 		roundTripTime);
1809 }
1810 
1811 
1812 void
1813 TCPEndpoint::_ResetSlowStart()
1814 {
1815 	fSlowStartThreshold = max_c((fSendMax - fSendUnacknowledged) / 2,
1816 		2 * fSendMaxSegmentSize);
1817 	fCongestionWindow = fSendMaxSegmentSize;
1818 }
1819 
1820 
1821 void
1822 TCPEndpoint::_DuplicateAcknowledge(tcp_segment_header &segment)
1823 {
1824 	fDuplicateAcknowledgeCount++;
1825 
1826 	if (fDuplicateAcknowledgeCount < 3)
1827 		return;
1828 	else if (fDuplicateAcknowledgeCount == 3) {
1829 		_ResetSlowStart();
1830 		fCongestionWindow = fSlowStartThreshold + 3
1831 			* fSendMaxSegmentSize;
1832 		fSendNext = segment.acknowledge;
1833 	} else if (fDuplicateAcknowledgeCount > 3)
1834 		fCongestionWindow += fSendMaxSegmentSize;
1835 
1836 	_SendQueued();
1837 }
1838 
1839 
1840 //	#pragma mark - timer
1841 
1842 
1843 /*static*/ void
1844 TCPEndpoint::_RetransmitTimer(net_timer *timer, void *data)
1845 {
1846 	TCPEndpoint *endpoint = (TCPEndpoint *)data;
1847 
1848 	MutexLocker locker(endpoint->fLock);
1849 	if (!locker.IsLocked())
1850 		return;
1851 
1852 	endpoint->_Retransmit();
1853 }
1854 
1855 
1856 /*static*/ void
1857 TCPEndpoint::_PersistTimer(net_timer *timer, void *data)
1858 {
1859 	TCPEndpoint *endpoint = (TCPEndpoint *)data;
1860 
1861 	MutexLocker locker(endpoint->fLock);
1862 	if (!locker.IsLocked())
1863 		return;
1864 
1865 	endpoint->_SendQueued(true);
1866 }
1867 
1868 
1869 /*static*/ void
1870 TCPEndpoint::_DelayedAcknowledgeTimer(struct net_timer *timer, void *data)
1871 {
1872 	TCPEndpoint *endpoint = (TCPEndpoint *)data;
1873 
1874 	MutexLocker locker(endpoint->fLock);
1875 	if (!locker.IsLocked())
1876 		return;
1877 
1878 	endpoint->SendAcknowledge(true);
1879 }
1880 
1881 
1882 /*static*/ void
1883 TCPEndpoint::_TimeWaitTimer(struct net_timer *timer, void *data)
1884 {
1885 	TCPEndpoint *endpoint = (TCPEndpoint *)data;
1886 
1887 	if (mutex_lock(&endpoint->fLock) < B_OK)
1888 		return;
1889 
1890 	endpoint->DeleteSocket();
1891 }
1892 
1893